Adding the VCF ROD. Also changed the VCF objects to much more user friendly.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1658 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-09-18 20:19:34 +00:00
parent d9ee515a9b
commit 7b39aa4966
24 changed files with 1206 additions and 295 deletions

View File

@ -15,6 +15,8 @@ import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.ArrayList;
/**
@ -123,6 +125,12 @@ public class RodGLF implements ReferenceOrderedDatum, Variation, Iterator<RodGLF
return mRecord.getRefBase().toString();
}
/** are we bi-allelic? */
@Override
public boolean isBiallelic() {
return true;
}
/**
* Returns true if all observed alleles are reference alleles. All is<Variant> methods (where Variant=SNP,Insertion, etc) should
* return false at such site to ensure consistency. This method is included for use with genotyping calls (isGenotype()==true), it makes
@ -154,9 +162,9 @@ public class RodGLF implements ReferenceOrderedDatum, Variation, Iterator<RodGLF
@Override
public char getAlternativeBaseForSNP() {
if (!this.isSNP()) throw new IllegalStateException("we're not a SNP");
if (getAlternateBases().charAt(0) == this.getReference().charAt(0))
return getAlternateBases().charAt(1);
return getAlternateBases().charAt(0);
if (getAlternateBase().charAt(0) == this.getReference().charAt(0))
return getAlternateBase().charAt(1);
return getAlternateBase().charAt(0);
}
@ -168,9 +176,9 @@ public class RodGLF implements ReferenceOrderedDatum, Variation, Iterator<RodGLF
@Override
public char getReferenceForSNP() {
if (!this.isSNP()) throw new IllegalStateException("we're not a SNP");
if (getAlternateBases().charAt(0) == this.getReference().charAt(0))
return getAlternateBases().charAt(0);
return getAlternateBases().charAt(1);
if (getAlternateBase().charAt(0) == this.getReference().charAt(0))
return getAlternateBase().charAt(0);
return getAlternateBase().charAt(1);
}
@ -252,10 +260,22 @@ public class RodGLF implements ReferenceOrderedDatum, Variation, Iterator<RodGLF
* @return a string, of ploidy
*/
@Override
public String getAlternateBases() {
public String getAlternateBase() {
return this.getBestGenotype(0).toString();
}
/**
* gets the alternate bases. Use this method if teh allele count is greater then 2
*
* @return
*/
@Override
public List<String> getAlternateBases() {
List<String> list = new ArrayList<String>();
list.add(this.getAlternateBase());
return list;
}
/**
* Returns minor allele frequency.
*
@ -270,8 +290,7 @@ public class RodGLF implements ReferenceOrderedDatum, Variation, Iterator<RodGLF
@Override
public VARIANT_TYPE getType() {
if (this.isSNP()) return VARIANT_TYPE.SNP;
else if (this.isInsertion()) return VARIANT_TYPE.INDEL;
else if (this.isDeletion()) return VARIANT_TYPE.DELETION;
else if (this.isInsertion() || this.isDeletion()) return VARIANT_TYPE.INDEL;
else return VARIANT_TYPE.REFERENCE;
}

View File

@ -25,20 +25,21 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.Genotype;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class RodGeliText extends BasicReferenceOrderedDatum implements Variation, VariantBackedByGenotype, AllelicVariant {
public enum Genotype {
public enum Genotype_Strings {
AA, AC, AG, AT, CC, CG, CT, GG, GT, TT
}
@ -187,10 +188,22 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation
* @return a string, of ploidy
*/
@Override
public String getAlternateBases() {
public String getAlternateBase() {
return this.bestGenotype;
}
/**
* gets the alternate bases. If this is homref, throws an UnsupportedOperationException
*
* @return
*/
@Override
public List<String> getAlternateBases() {
List<String> list = new ArrayList<String>();
list.add(this.getAlternateBase());
return list;
}
public boolean isIndel() {
return false;
}
@ -306,7 +319,7 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation
if (genotypeLikelihoods[likelihoodIndex] > bestLikelihood) {
bestLikelihood = genotypeLikelihoods[likelihoodIndex];
bestGenotype = Genotype.values()[likelihoodIndex].toString();
bestGenotype = Genotype_Strings.values()[likelihoodIndex].toString();
}
}
@ -317,8 +330,8 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation
}
for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) {
if (refBase == Genotype.values()[likelihoodIndex].toString().charAt(0) &&
refBase == Genotype.values()[likelihoodIndex].toString().charAt(1)) {
if (refBase == Genotype_Strings.values()[likelihoodIndex].toString().charAt(0) &&
refBase == Genotype_Strings.values()[likelihoodIndex].toString().charAt(1)) {
refLikelihood = genotypeLikelihoods[likelihoodIndex];
}
}
@ -328,19 +341,28 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation
this.lodBtnb = (bestLikelihood - nextBestLikelihood);
}
/**
* get the likelihoods
*
* @return an array in lexigraphical order of the likelihoods
*/
@Override
public org.broadinstitute.sting.utils.genotype.Genotype getGenotype(DiploidGenotype x) {
// figure out the best -switch to this
/*Integer values[] = Utils.SortPermutation(this.genotypeLikelihoods);
int indexThis = x.ordinal();
int other = (x.toString().equals(Genotype.values()[values[1]])) ? values[9] : values[8];
double lod = (genotypeLikelihoods[indexThis] - genotypeLikelihoods[other]);*/
return new BasicGenotype(getLocation(),x.toString(), refBase, lodBtnb);
public Genotype getGenotype(DiploidGenotype x) {
if (x.toString() != this.getAltBasesFWD()) throw new IllegalStateException("We don't contain that genotype!");
return new BasicGenotype(getLocation(), x.toString(), refBase, lodBtnb);
}
/**
* get the likelihoods
*
* @return an array in lexigraphical order of the likelihoods
*/
@Override
public List<Genotype> getGenotypes() {
List<Genotype> ret = new ArrayList<Genotype>();
ret.add(new BasicGenotype(getLocation(), this.getAltBasesFWD(), refBase, lodBtnb));
return ret;
}
/**

View File

@ -187,10 +187,22 @@ public class RodGenotypeChipAsGFF extends BasicReferenceOrderedDatum implements
* @return
*/
@Override
public String getAlternateBases() {
public String getAlternateBase() {
return this.feature;
}
/**
* gets the alternate bases. Use this method if teh allele count is greater then 2
*
* @return
*/
@Override
public List<String> getAlternateBases() {
List<String> list = new ArrayList<String>();
list.add(this.getAlternateBase());
return list;
}
/**
* get the frequency of this variant
*
@ -247,6 +259,18 @@ public class RodGenotypeChipAsGFF extends BasicReferenceOrderedDatum implements
public boolean isBiallelic() { return true; }
public int length() { return 1; }
/**
* get the likelihoods
*
* @return an array in lexigraphical order of the likelihoods
*/
@Override
public List<Genotype> getGenotypes() {
List<Genotype> ret = new ArrayList<Genotype>();
ret.add(new BasicGenotype(this.getLocation(),this.feature,this.getRefSnpFWD(),this.getConsensusConfidence()));
return ret;
}
/**
* get the likelihoods
*
@ -257,7 +281,7 @@ public class RodGenotypeChipAsGFF extends BasicReferenceOrderedDatum implements
if (!x.toString().equals(this.getAltBasesFWD())) throw new IllegalStateException("Unable to retrieve genotype");
return new BasicGenotype(this.getLocation(),this.feature,this.getRefSnpFWD(),this.getConsensusConfidence());
}
/**
* do we have the specified genotype? not all backedByGenotypes
* have all the genotype data.

View File

@ -0,0 +1,329 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.*;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* @author aaron
* <p/>
* Class RodVCF
* <p/>
* An implementation of the ROD for VCF.
*/
public class RodVCF extends BasicReferenceOrderedDatum implements Variation, VariantBackedByGenotype, Iterator<RodVCF> {
// our VCF related information
private VCFReader mReader;
public VCFRecord mCurrentRecord;
public RodVCF(String name) {
super(name);
}
public void assertNotNull() {
if (mCurrentRecord == null) {
throw new UnsupportedOperationException("The current Record is null");
}
}
@Override
public boolean parseLine(Object header, String[] parts) throws IOException {
throw new UnsupportedOperationException("We don't support the parse line");
}
public Object initialize(final File source) throws FileNotFoundException {
if (mReader == null) {
mReader = new VCFReader(source);
}
return mReader.getHeader();
}
@Override
public String toString() {
if (this.mCurrentRecord != null)
return this.mCurrentRecord.toString();
else
return "";
}
public Iterator<RodVCF> createIterator(String name, File file) {
RodVCF vcf = new RodVCF(name);
try {
vcf.initialize(file);
} catch (FileNotFoundException e) {
throw new StingException("Unable to find file " + file);
}
return vcf;
}
/**
* get the frequency of this variant
*
* @return VariantFrequency with the stored frequency
*/
@Override
public double getNonRefAlleleFrequency() {
assertNotNull();
if (this.mCurrentRecord.getInfoValues().containsKey("AF")) {
return Double.valueOf(this.mCurrentRecord.getInfoValues().get("AF"));
} else {
// this is the poor man's AF
if (this.mCurrentRecord.getInfoValues().containsKey("AC") && this.mCurrentRecord.getInfoValues().containsKey("AN")) {
String splt[] = mCurrentRecord.getInfoValues().get("AC").split(",");
if (splt.length > 0) {
return (Double.valueOf(splt[0]) / Double.valueOf(mCurrentRecord.getInfoValues().get("AN")));
}
}
}
return 0.0;
}
/** @return the VARIANT_TYPE of the current variant */
@Override
public VARIANT_TYPE getType() {
if (this.isSNP()) return VARIANT_TYPE.SNP;
else if (this.isIndel()) return VARIANT_TYPE.INDEL;
return VARIANT_TYPE.REFERENCE;
}
/**
* are we a SNP? If not we're a Indel/deletion
*
* @return true if we're a SNP
*/
@Override
public boolean isSNP() {
this.assertNotNull();
if (!mCurrentRecord.hasAlternateAllele())
return false;
for (String alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.length() != 1)
return false;
}
return true;
}
/**
* are we an insertion?
*
* @return true if we are, false otherwise
*/
@Override
public boolean isInsertion() {
this.assertNotNull();
if (!mCurrentRecord.hasAlternateAllele())
return false;
for (String alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.startsWith("I"))
return true;
}
return false;
}
/**
* are we an insertion?
*
* @return true if we are, false otherwise
*/
@Override
public boolean isDeletion() {
this.assertNotNull();
if (!mCurrentRecord.hasAlternateAllele())
return false;
for (String alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.startsWith("D"))
return true;
}
return false;
}
@Override
public GenomeLoc getLocation() {
this.assertNotNull();
return GenomeLocParser.createGenomeLoc(mCurrentRecord.getChromosome(), mCurrentRecord.getPosition());
}
/**
* get the reference base(s) at this position
*
* @return the reference base or bases, as a string
*/
@Override
public String getReference() {
return String.valueOf(mCurrentRecord.getReferenceBase());
}
/** are we bi-allelic? */
@Override
public boolean isBiallelic() {
return (this.getAlternateBases().size() == 1);
}
/**
* get the -1 * (log 10 of the error value)
*
* @return the log based error estimate
*/
@Override
public double getNegLog10PError() {
// we're -10 log(error), we have to divide by 10
return mCurrentRecord.getQual() / 10.0;
}
/**
* are we truely a variant, given a reference
*
* @return false if we're a variant(indel, delete, SNP, etc), true if we're not
*/
@Override
public boolean isReference() {
return (!mCurrentRecord.hasAlternateAllele());
}
/**
* gets the alternate bases. If this is homref, throws an UnsupportedOperationException
*
* @return
*/
@Override
public String getAlternateBase() {
if (!this.isBiallelic()) throw new UnsupportedOperationException("We're not biallelic, so please call getAlternateBases instead");
return this.mCurrentRecord.getAlternateAlleles().get(0);
}
/**
* gets the alternate bases. If this is homref, throws an UnsupportedOperationException
*
* @return
*/
@Override
public List<String> getAlternateBases() {
return this.mCurrentRecord.getAlternateAlleles();
}
/**
* are we an insertion or a deletion? yes, then return true. No? Well, false then.
*
* @return true if we're an insertion or deletion
*/
@Override
public boolean isIndel() {
return (!isSNP());
}
/**
* gets the alternate base is the case of a SNP. Throws an IllegalStateException if we're not a SNP
* of
*
* @return a char, representing the alternate base
*/
@Override
public char getAlternativeBaseForSNP() {
if (!isSNP()) throw new IllegalStateException("we're not a SNP");
return mCurrentRecord.getAlternateAlleles().get(0).charAt(0);
}
/**
* gets the reference base is the case of a SNP. Throws an IllegalStateException if we're not a SNP
*
* @return a char, representing the alternate base
*/
@Override
public char getReferenceForSNP() {
if (!isSNP()) throw new IllegalStateException("we're not a SNP");
return mCurrentRecord.getReferenceBase();
}
/**
* get the genotypes
*
* @return a list of the genotypes
*/
@Override
public List<Genotype> getGenotypes() {
List<Genotype> genotypes = new ArrayList<Genotype>();
if (!this.mCurrentRecord.hasGenotypeData()) {
return genotypes;
}
double refQual = (this.getNegLog10PError());
// add the reference
for (VCFGenotypeRecord rec : mCurrentRecord.getVCFGenotypeRecords()) {
double qual = 0;
if (rec.getAlleles().equals(this.getReference()))
qual = refQual;
else if (rec.getFields().containsKey("GQ"))
qual = Double.valueOf(rec.getFields().get("GQ")) / 10.0;
genotypes.add(new BasicGenotype(this.getLocation(), Utils.join("", rec.getAlleles()), this.getReference().charAt(0), qual));
}
return genotypes;
}
/**
* get the likelihoods
*
* @return an array in lexigraphical order of the likelihoods
*/
@Override
public Genotype getGenotype(DiploidGenotype x) {
if (x.toString().equals(getReference()))
return new BasicGenotype(this.getLocation(), getReference(), this.getReference().charAt(0), 0);
for (VCFGenotypeRecord record : mCurrentRecord.getVCFGenotypeRecords()) {
if (Utils.join("", record.getAlleles()).equals(x.toString())) {
double qual = 0.0;
if (record.getAlleles().equals(this.getReference()))
qual = this.getNegLog10PError();
else if (record.getFields().containsKey("GQ"))
qual = Double.valueOf(record.getFields().get("GQ")) / 10.0;
return new BasicGenotype(this.getLocation(), Utils.join("", record.getAlleles()), this.getReference().charAt(0), qual);
}
}
return null;
}
/**
* do we have the specified genotype? not all backedByGenotypes
* have all the genotype data.
*
* @param x the genotype
*
* @return true if available, false otherwise
*/
@Override
public boolean hasGenotype(DiploidGenotype x) {
if (getGenotype(x) != null)
return true;
return false;
}
@Override
public boolean hasNext() {
return (mReader.hasNext());
}
@Override
public RodVCF next() {
mCurrentRecord = mReader.next();
return this;
}
@Override
public void remove() {
throw new UnsupportedOperationException("You cannot remove from a VCF rod");
}
}

View File

@ -2,17 +2,20 @@ package org.broadinstitute.sting.gatk.refdata;
import net.sf.picard.util.SequenceUtil;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*;
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Example format:
* 585 chr1 433 433 rs56289060 0 + - - -/C genomic insertion unknown 0 0 unknown between 1
* 585 chr1 491 492 rs55998931 0 + C C C/T genomic single unknown 0 0 unknown exact 1
*
* 585 chr1 433 433 rs56289060 0 + - - -/C genomic insertion unknown 0 0 unknown between 1
* 585 chr1 491 492 rs55998931 0 + C C C/T genomic single unknown 0 0 unknown exact 1
* <p/>
* User: mdepristo
* Date: Feb 27, 2009
* Time: 10:47:14 AM
@ -60,7 +63,9 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
// manipulating the SNP information
//
// ----------------------------------------------------------------------
public GenomeLoc getLocation() { return loc; }
public GenomeLoc getLocation() {
return loc;
}
/**
* get the reference base(s) at this position
@ -86,7 +91,8 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
return strand.equals("+");
}
/** Returns bases in the reference allele as a String. String can be empty (as in insertion into
/**
* Returns bases in the reference allele as a String. String can be empty (as in insertion into
* the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters
* (for longer indels).
*
@ -109,18 +115,18 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
*/
public char getRefSnpFWD() throws IllegalStateException {
//System.out.printf("refbases is %s but %s%n", refBases, toString());
if ( isIndel() ) throw new IllegalStateException("Variant is not a SNP");
if (isIndel()) throw new IllegalStateException("Variant is not a SNP");
// fix - at least this way we ensure that we'll get the other base compared to getAltBasesFWD()
List<String> alleles = getAllelesFWD();
String val = (alleles.get(0).equals(refBases) ? alleles.get(0) : alleles.get(1));
return val.charAt(0);
// if ( onFwdStrand() ) return refBases.charAt(0);
// else return SequenceUtil.reverseComplement(refBases).charAt(0);
// else return SequenceUtil.reverseComplement(refBases).charAt(0);
}
public List<String> getAllelesFWD() {
List<String> alleles = null;
if ( onFwdStrand() )
if (onFwdStrand())
alleles = Arrays.asList(observed.split("/"));
else
alleles = Arrays.asList(SequenceUtil.reverseComplement(observed).split("/"));
@ -148,13 +154,22 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
public VARIANT_TYPE getType() {
return VARIANT_TYPE.SNP;
}// ----------------------------------------------------------------------
//
// What kind of variant are we?
//
// ----------------------------------------------------------------------
public boolean isSNP() { return varType.contains("single"); }
public boolean isInsertion() { return varType.contains("insertion"); }
public boolean isDeletion() { return varType.contains("deletion"); }
public boolean isSNP() {
return varType.contains("single");
}
public boolean isInsertion() {
return varType.contains("insertion");
}
public boolean isDeletion() {
return varType.contains("deletion");
}
/**
* get the base representation of this Variant
@ -162,11 +177,25 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
* @return a string, of ploidy
*/
@Override
public String getAlternateBases() {
public String getAlternateBase() {
return getAllelesFWDString();
}
public boolean isIndel() { return isInsertion() || isDeletion() || varType.contains("in-del"); }
/**
* gets the alternate bases. Use this method if teh allele count is greater then 2
*
* @return
*/
@Override
public List<String> getAlternateBases() {
List<String> list = new ArrayList<String>();
list.add(this.getAlternateBase());
return list;
}
public boolean isIndel() {
return isInsertion() || isDeletion() || varType.contains("in-del");
}
/**
* gets the alternate base is the case of a SNP. Throws an IllegalStateException in the case
@ -178,9 +207,9 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
public char getAlternativeBaseForSNP() {
return getAltSnpFWD(); /*
if (!this.isSNP()) throw new IllegalStateException("we're not a SNP");
if (getAlternateBases().charAt(0) == this.getReference())
return getAlternateBases().charAt(1);
return getAlternateBases().charAt(0); */
if (getAlternateBase().charAt(0) == this.getReference())
return getAlternateBase().charAt(1);
return getAlternateBase().charAt(0); */
}
/**
@ -193,10 +222,17 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
return 0; //To change body of implemented methods use File | Settings | File Templates.
}
public boolean isReference() { return false; } // snp locations are never "reference", there's always a variant
public boolean isReference() {
return false;
} // snp locations are never "reference", there's always a variant
public boolean isHapmap() { return validationStatus.contains("by-hapmap"); }
public boolean is2Hit2Allele() { return validationStatus.contains("by-2hit-2allele"); }
public boolean isHapmap() {
return validationStatus.contains("by-hapmap");
}
public boolean is2Hit2Allele() {
return validationStatus.contains("by-2hit-2allele");
}
// ----------------------------------------------------------------------
//
@ -205,9 +241,9 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
// ----------------------------------------------------------------------
public String toString() {
return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
getLocation().getContig(), getLocation().getStart(), getLocation().getStop()+1,
name, strand, refBases, observed, molType,
varType, validationStatus, avHet, avHetSE, func, locType, weight );
getLocation().getContig(), getLocation().getStart(), getLocation().getStop() + 1,
name, strand, refBases, observed, molType,
varType, validationStatus, avHet, avHetSE, func, locType, weight);
}
public String toSimpleString() {
@ -216,18 +252,18 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
public String toMediumString() {
String s = String.format("%s:%s:%s", getLocation().toString(), name, getAllelesFWDString());
if ( isSNP() ) s += ":SNP";
if ( isIndel() ) s += ":Indel";
if ( isHapmap() ) s += ":Hapmap";
if ( is2Hit2Allele() ) s += ":2Hit";
if (isSNP()) s += ":SNP";
if (isIndel()) s += ":Indel";
if (isHapmap()) s += ":Hapmap";
if (is2Hit2Allele()) s += ":2Hit";
return s;
}
public String repl() {
return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
585, getLocation().getContig(), getLocation().getStart()-1, getLocation().getStop(),
name, strand, refBases, refBases, observed, molType,
varType, validationStatus, avHet, avHetSE, func, locType, weight );
585, getLocation().getContig(), getLocation().getStart() - 1, getLocation().getStop(),
name, strand, refBases, refBases, observed, molType,
varType, validationStatus, avHet, avHetSE, func, locType, weight);
}
public boolean parseLine(final Object header, final String[] parts) {
@ -235,12 +271,12 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
String contig = parts[1];
long start = Long.parseLong(parts[2]) + 1; // The final is 0 based
long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop-1));
loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop - 1));
name = parts[4];
strand = parts[6];
refBases = parts[7];
if ( strand == "-" )
if (strand == "-")
refBases = BaseUtils.simpleReverseComplement(refBases);
observed = parts[9];
molType = parts[10];
@ -253,69 +289,83 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
weight = Integer.parseInt(parts[17]);
//System.out.printf("Parsed %s%n", toString());
return true;
} catch( MalformedGenomeLocException ex ) {
} catch (MalformedGenomeLocException ex) {
// Just rethrow malformed genome locs; the ROD system itself will deal with these.
throw ex;
} catch( ArrayIndexOutOfBoundsException ex ) {
} catch (ArrayIndexOutOfBoundsException ex) {
// Just rethrow malformed genome locs; the ROD system itself will deal with these.
throw new RuntimeException("Badly formed dbSNP line: " + ex);
} catch ( RuntimeException e ) {
} catch (RuntimeException e) {
System.out.printf(" Exception caught during parsing DBSNP line %s%n", Utils.join(" <=> ", parts));
throw e;
}
}
public String getAltBasesFWD() {
public String getAltBasesFWD() {
List<String> alleles = getAllelesFWD();
return (alleles.get(0).equals(refBases) ? alleles.get(1) : alleles.get(0));
}
}
public char getAltSnpFWD() throws IllegalStateException {
if ( !isSNP() )
public char getAltSnpFWD() throws IllegalStateException {
if (!isSNP())
throw new IllegalStateException("I'm not a SNP");
return getAltBasesFWD().charAt(0);
}
}
public double getConsensusConfidence() {
// TODO Auto-generated method stub
return Double.MAX_VALUE;
}
public double getConsensusConfidence() {
// TODO Auto-generated method stub
return Double.MAX_VALUE;
}
public List<String> getGenotype() throws IllegalStateException {
return Arrays.asList(Utils.join("", getAllelesFWD()));
}
public List<String> getGenotype() throws IllegalStateException {
return Arrays.asList(Utils.join("", getAllelesFWD()));
}
public double getMAF() {
// Fixme: update to actually get MAF
//return avHet;
public double getMAF() {
// Fixme: update to actually get MAF
//return avHet;
return -1;
}
}
public double getHeterozygosity() {
return avHet;
}
public int getPloidy() throws IllegalStateException {
// TODO Auto-generated method stub
return 0;
}
public int getPloidy() throws IllegalStateException {
// TODO Auto-generated method stub
return 0;
}
public double getVariationConfidence() {
// TODO Auto-generated method stub
return Double.MAX_VALUE;
}
public double getVariationConfidence() {
// TODO Auto-generated method stub
return Double.MAX_VALUE;
}
public boolean isGenotype() {
// TODO Auto-generated method stub
return false;
}
public boolean isGenotype() {
// TODO Auto-generated method stub
return false;
}
public boolean isBiallelic() {
// TODO Auto-generated method stub
return observed.indexOf('/')==observed.lastIndexOf('/');
}
public boolean isBiallelic() {
// TODO Auto-generated method stub
return observed.indexOf('/') == observed.lastIndexOf('/');
}
public int length() { return (int)(loc.getStop() - loc.getStart() + 1); }
public int length() {
return (int) (loc.getStop() - loc.getStart() + 1);
}
/**
* get the likelihoods
*
* @return an array in lexigraphical order of the likelihoods
*/
@Override
public List<org.broadinstitute.sting.utils.genotype.Genotype> getGenotypes() {
List<org.broadinstitute.sting.utils.genotype.Genotype> list = new ArrayList<org.broadinstitute.sting.utils.genotype.Genotype>();
list.add(new BasicGenotype(this.getLocation(), this.getAltBasesFWD(), this.getRefSnpFWD(), this.getConsensusConfidence()));
return list;
}
/**
* get the likelihoods
@ -325,7 +375,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
@Override
public org.broadinstitute.sting.utils.genotype.Genotype getGenotype(DiploidGenotype x) {
if (!x.toString().equals(this.getAltBasesFWD())) throw new IllegalStateException("Unable to retrieve genotype");
return new BasicGenotype(this.getLocation(),this.getAltBasesFWD(),this.getRefSnpFWD(),this.getConsensusConfidence());
return new BasicGenotype(this.getLocation(), this.getAltBasesFWD(), this.getRefSnpFWD(), this.getConsensusConfidence());
}
/**
@ -338,6 +388,6 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V
*/
@Override
public boolean hasGenotype(DiploidGenotype x) {
return (!x.toString().equals(this.getAltBasesFWD())) ? false : true;
return (!x.toString().equals(this.getAltBasesFWD())) ? false : true;
}
}

View File

@ -318,7 +318,7 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
if ( VariantsToVCF.generateVCFRecord(context.getTracker(), context.getReferenceContext(), context.getAlignmentContext(true), vcfHeader, gt, map, sampleNames, out, false, false) ) {
if ( !filterFailureString.equals("") )
map.put(VCFHeader.HEADER_FIELDS.FILTER, filterFailureString);
vcfWriter.addRecord(new VCFRecord(vcfHeader, map, "GT:GQ:DP", gt));
vcfWriter.addRecord(new VCFRecord(map, "GT:GQ:DP", gt));
}
}

View File

@ -58,18 +58,18 @@ public class GenotypeConcordance extends BasicVariantAnalysis implements Genotyp
int truthIndex, callIndex;
if (chip == null)
truthIndex = UNKNOWN;
else if (chip.getAlternateBases().equals(g.toString()))
else if (chip.getAlternateBase().equals(g.toString()))
truthIndex = REF;
else if (chip.getAlternateBases().charAt(0) != chip.getAlternateBases().charAt(1))
else if (chip.getAlternateBase().charAt(0) != chip.getAlternateBase().charAt(1))
truthIndex = VAR_HET;
else
truthIndex = VAR_HOM;
if (eval == null)
callIndex = NO_CALL;
else if (eval.getAlternateBases().equals(g.toString()))
else if (eval.getAlternateBase().equals(g.toString()))
callIndex = REF;
else if (eval.getAlternateBases().charAt(0) != eval.getAlternateBases().charAt(1))
else if (eval.getAlternateBase().charAt(0) != eval.getAlternateBase().charAt(1))
callIndex = VAR_HET;
else
callIndex = VAR_HOM;

View File

@ -38,10 +38,10 @@ public class IndelMetricsAnalysis extends BasicVariantAnalysis implements Genoty
else
throw new RuntimeException("Variation is indel, but isn't insertion or deletion!");
if ( eval.getAlternateBases().length() < 100 ) {
sizes[eval.isDeletion() ? 0 : 1][eval.getAlternateBases().length()]++;
if ( eval.getAlternateBases().length() > maxSize )
maxSize = eval.getAlternateBases().length();
if ( eval.getAlternateBase().length() < 100 ) {
sizes[eval.isDeletion() ? 0 : 1][eval.getAlternateBase().length()]++;
if ( eval.getAlternateBase().length() > maxSize )
maxSize = eval.getAlternateBase().length();
}
}

View File

@ -5,8 +5,6 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import java.io.*;
import java.util.*;
@ -185,7 +183,7 @@ class PooledConcordanceTable {
public boolean pooledCallIsRef(Variation eval, char ref) {
// code broken out for easy alteration when we start using pool-specific variations
return eval.getAlternateBases().equalsIgnoreCase((Utils.dupString(ref,2)));
return eval.getAlternateBase().equalsIgnoreCase((Utils.dupString(ref,2)));
}
public int calculateNumFrequencyIndeces(int poolSize) {
@ -209,7 +207,7 @@ class PooledConcordanceTable {
for ( Variation eval : evals ) {
if ( mismatchingCalls(firstEval, eval, ref) ) {
// todo -- make this not a StingException but go to the log
throw new StingException("Tri-Allelic Position "+eval.getAlternateBases()+"/"+firstEval.getAlternateBases() + " Ref: "+ ref + " not supported");
throw new StingException("Tri-Allelic Position "+eval.getAlternateBase()+"/"+firstEval.getAlternateBase() + " Ref: "+ ref + " not supported");
} else {
alternateFrequency += calledVariantFrequency(eval,ref);
}
@ -233,10 +231,10 @@ class PooledConcordanceTable {
public boolean mismatchingCalls(Variation eval, Variation chip, char ref) {
// eval and chip guaranteed to be non-null
char chipF = chip.getAlternateBases().charAt(0);
char chipS = chip.getAlternateBases().charAt(1);
char evalF = chip.getAlternateBases().charAt(0);
char evalS = chip.getAlternateBases().charAt(1);
char chipF = chip.getAlternateBase().charAt(0);
char chipS = chip.getAlternateBase().charAt(1);
char evalF = chip.getAlternateBase().charAt(0);
char evalS = chip.getAlternateBase().charAt(1);
boolean mismatch;
if (chipF == ref) {
if ( chipS == ref ) {
@ -261,7 +259,7 @@ class PooledConcordanceTable {
public double calledVariantFrequency( Variation var, char ref ) {
// code broken out for easy alteration when we start using pool-specific variations
String varStr = var.getAlternateBases();
String varStr = var.getAlternateBase();
double freq;
if ( varStr.charAt(0) != ref && varStr.charAt(1) != ref ) {
freq = (double) 2;

View File

@ -31,7 +31,7 @@ public class VariantCounter extends BasicVariantAnalysis implements GenotypeAnal
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
nSNPs += eval == null ? 0 : 1;
if ( this.getMaster().evalContainsGenotypes && eval != null && eval.isSNP() && ((VariantBackedByGenotype)eval).getGenotype( DiploidGenotype.valueOf(eval.getAlternateBases())).isHet() )
if ( this.getMaster().evalContainsGenotypes && eval != null && eval.isSNP() && ((VariantBackedByGenotype)eval).getGenotype( DiploidGenotype.valueOf(eval.getAlternateBase())).isHet() )
nHets++;
return null;

View File

@ -86,7 +86,7 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA
if (eval != null) {
char alt = (eval.isSNP()) ? eval.getAlternativeBaseForSNP() : eval.getReference().charAt(0);
if (dbSNP != null && dbSNP.isSNP())
return !dbSNP.getAlternateBases().contains(String.valueOf(alt));
return !dbSNP.getAlternateBase().contains(String.valueOf(alt));
}
return false;
}
@ -117,7 +117,7 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA
if (dbsnp.isSNP() && eval.isSNP() && discordantP(dbsnp, eval)) {
return String.format("Discordant [DBSNP %s] [EVAL %s]", dbsnp, eval);
} else if (dbsnp.isIndel() && eval.isSNP()) {
return String.format("SNP-at-indel DBSNP=%s %s", dbsnp.getAlternateBases(), eval);
return String.format("SNP-at-indel DBSNP=%s %s", dbsnp.getAlternateBase(), eval);
} else {
return null;
}

View File

@ -77,7 +77,7 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
List<VCFGenotypeRecord> gt = new ArrayList<VCFGenotypeRecord>();
Map<VCFHeader.HEADER_FIELDS,String> map = new HashMap<VCFHeader.HEADER_FIELDS,String>();
if ( generateVCFRecord(tracker, ref, context, vcfheader, gt, map, sampleNames, out, SUPPRESS_MULTISTATE, VERBOSE) ) {
vcfwriter.addRecord(new VCFRecord(vcfheader, map, "GT:GQ:DP", gt));
vcfwriter.addRecord(new VCFRecord(map, "GT:GQ:DP", gt));
//vcfwriter.addRecord(new VCFRecord(vcfheader, map, "GT", gt));
return 1;
}
@ -123,7 +123,7 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
int allele2 = BaseUtils.simpleBaseToBaseIndex(alleles.get(0).charAt(1));
if (allele2 >= 0 && allele2 != refbase) { alleleCounts[allele2]++; }
gt.add(new VCFGenotypeRecord(name, str, alleles, VCFGenotypeRecord.PHASE.UNPHASED, ref.getBase()));
gt.add(new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED, str ));
numSNPs++;
snpQual += av.getVariationConfidence();
@ -134,7 +134,7 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
List<String> alleles = new ArrayList<String>();
alleles.add(ref.getBase() + "" + ref.getBase());
gt.add(new VCFGenotypeRecord(name, str, alleles, VCFGenotypeRecord.PHASE.UNPHASED, ref.getBase()));
gt.add(new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED, str ));
numRefs++;
}
@ -184,7 +184,7 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
} else if (field == VCFHeader.HEADER_FIELDS.QUAL) {
map.put(field, String.format("%d", snpQual > 99 ? 99 : (int) snpQual));
} else if (field == VCFHeader.HEADER_FIELDS.FILTER) {
map.put(field, String.valueOf("0"));
map.put(field, "0");
} else if (field == VCFHeader.HEADER_FIELDS.INFO) {
String infostr = ".";
ArrayList<String> info = new ArrayList<String>();

View File

@ -2,6 +2,9 @@ package org.broadinstitute.sting.utils.genotype;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.List;
import java.util.ArrayList;
/**
* User: aaron
* Date: Sep 9, 2009
@ -59,8 +62,7 @@ public class BasicVariation implements Variation {
*/
@Override
public VARIANT_TYPE getType() {
if (mLength > 0) return VARIANT_TYPE.INDEL;
if (mLength < 0) return VARIANT_TYPE.DELETION;
if (mLength != 0) return VARIANT_TYPE.INDEL;
return (isSNP()) ? VARIANT_TYPE.SNP : VARIANT_TYPE.REFERENCE;
}
@ -81,10 +83,22 @@ public class BasicVariation implements Variation {
}
@Override
public String getAlternateBases() {
public String getAlternateBase() {
return mBases;
}
/**
* gets the alternate bases. Use this method if teh allele count is greater then 2
*
* @return
*/
@Override
public List<String> getAlternateBases() {
List<String> list = new ArrayList<String>();
list.add(this.getAlternateBase());
return list;
}
@Override
public GenomeLoc getLocation() {
return mLocation;
@ -95,6 +109,12 @@ public class BasicVariation implements Variation {
return (mRef);
}
/** are we bi-allelic? */
@Override
public boolean isBiallelic() {
return true;
}
@Override
public double getNegLog10PError() {
return mConfidence;
@ -131,9 +151,9 @@ public class BasicVariation implements Variation {
if (!this.isSNP()) throw new IllegalStateException("we're not a SNP");
// we know that if we're a snp, the reference is a single base, so charAt(0) is safe
if (getAlternateBases().charAt(0) == this.getReference().charAt(0))
return getAlternateBases().charAt(1);
return getAlternateBases().charAt(0);
if (getAlternateBase().charAt(0) == this.getReference().charAt(0))
return getAlternateBase().charAt(1);
return getAlternateBase().charAt(0);
}
/**
@ -146,9 +166,9 @@ public class BasicVariation implements Variation {
if (!this.isSNP()) throw new IllegalStateException("we're not a SNP");
// we know that if we're a snp, the reference is a single base, so charAt(0) is safe
if (getAlternateBases().charAt(0) == this.getReference().charAt(0))
return getAlternateBases().charAt(0);
return getAlternateBases().charAt(1);
if (getAlternateBase().charAt(0) == this.getReference().charAt(0))
return getAlternateBase().charAt(0);
return getAlternateBase().charAt(1);
}

View File

@ -1,6 +1,6 @@
package org.broadinstitute.sting.utils.genotype;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import java.util.List;
/**
* @author aaron
@ -13,13 +13,16 @@ public interface VariantBackedByGenotype {
/**
* get the likelihoods
*
* @param x the genotype
* @return a map in lexigraphical order of the likelihoods
*/
public List<Genotype> getGenotypes();
/**
* get the likelihoods
*
* @return an array in lexigraphical order of the likelihoods
* @return a map in lexigraphical order of the likelihoods
*/
public Genotype getGenotype(DiploidGenotype x);
/**
* do we have the specified genotype? not all backedByGenotypes
* have all the genotype data.

View File

@ -2,6 +2,8 @@ package org.broadinstitute.sting.utils.genotype;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.List;
/**
* @author aaron
* <p/>
@ -12,7 +14,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
public interface Variation {
// the types of variants we currently allow
public enum VARIANT_TYPE {
SNP, INDEL, DELETION, REFERENCE // though reference is not really a variant
SNP, INDEL, REFERENCE // though reference is not really a variant
}
/**
@ -60,6 +62,9 @@ public interface Variation {
*/
public String getReference();
/** are we bi-allelic? */
public boolean isBiallelic();
/**
* get the -1 * (log 10 of the error value)
*
@ -69,31 +74,43 @@ public interface Variation {
/**
* are we truely a variant, given a reference
*
* @return false if we're a variant(indel, delete, SNP, etc), true if we're not
*/
public boolean isReference();
/**
* gets the alternate bases. If this is homref, throws an UnsupportedOperationException
* gets the alternate base. Use this method if we're biallelic
*
* @return
*/
public String getAlternateBases();
public String getAlternateBase();
/**
* gets the alternate bases. Use this method if teh allele count is greater then 2
*
* @return
*/
public List<String> getAlternateBases();
/**
* are we an insertion or a deletion? yes, then return true. No? Well, false then.
*
* @return true if we're an insertion or deletion
*/
public boolean isIndel();
/**
* gets the alternate base is the case of a SNP. Throws an IllegalStateException if we're not a SNP
* of
* of
*
* @return a char, representing the alternate base
*/
public char getAlternativeBaseForSNP();
/**
* gets the reference base is the case of a SNP. Throws an IllegalStateException if we're not a SNP
*
* @return a char, representing the alternate base
*/
public char getReferenceForSNP();

View File

@ -21,35 +21,33 @@ public class VCFGenotypeRecord {
}
// our phasing
private PHASE phaseType;
// our reference bases(s)
private final char mReferenceBase;
private PHASE mPhaseType;
// our bases(s)
private final List<String> mAlleleBases = new ArrayList<String>();
private final List<String> mGenotypeAlleles = new ArrayList<String>();
// our mapping of the format mFields to values
private final Map<String, String> mFields = new HashMap<String, String>();
// our sample name
private final String mSampleName;
/**
* create a VCF record
* Create a VCF genotype record
*
* @param keyValues the key values
* @param Alleles the alleles, one if we're halpoid, two if we're diploid
* @param phasing the phasing of the the genotype
* @param referenceBase the reference base
* @param sampleName
* @param genotypes
* @param phasing
* @param otherFlags
*/
public VCFGenotypeRecord(String sampleName, Map<String, String> keyValues, List<String> Alleles, PHASE phasing, char referenceBase) {
mSampleName = sampleName;
mReferenceBase = referenceBase;
mFields.putAll(keyValues);
mAlleleBases.addAll(Alleles);
phaseType = phasing;
public VCFGenotypeRecord(String sampleName, List<String> genotypes, PHASE phasing, Map<String, String> otherFlags) {
this.mSampleName = sampleName;
this.mGenotypeAlleles.addAll(genotypes);
this.mPhaseType = phasing;
this.mFields.putAll(otherFlags);
}
/**
* determine the phase of the genotype
*
@ -70,18 +68,55 @@ public class VCFGenotypeRecord {
/** getter methods */
public PHASE getPhaseType() {
return phaseType;
return mPhaseType;
}
public char getReference() {
return mReferenceBase;
public String getSampleName() {
return mSampleName;
}
public List<String> getAllele() {
return mAlleleBases;
public List<String> getAlleles() {
return mGenotypeAlleles;
}
public Map<String, String> getFields() {
return mFields;
}
public String toGenotypeString(List<String> altAlleles) {
String str = "";
boolean first = true;
for (String allele : mGenotypeAlleles) {
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
if (first) {
switch (mPhaseType) {
case UNPHASED:
str += "/";
break;
case PHASED:
str += "|";
break;
case PHASED_SWITCH_PROB:
str += "\\";
break;
case UNKNOWN:
throw new UnsupportedOperationException("Unknown phase type");
}
first = false;
}
}
return str;
}
public boolean equals(Object other) {
if (other instanceof VCFGenotypeRecord) {
if (((VCFGenotypeRecord) other).mPhaseType != this.mPhaseType) return false;
if (!((VCFGenotypeRecord) other).mGenotypeAlleles.equals(this.mGenotypeAlleles)) return false;
if (!((VCFGenotypeRecord) other).mFields.equals(mFields)) return false;
if (!((VCFGenotypeRecord) other).mSampleName.equals(this.mSampleName)) return false;
return true;
}
return false;
}
}

View File

@ -195,9 +195,9 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
genotypeRecords.add(getVCFGenotype(str, mFormatString, tokens[index], values.get(VCFHeader.HEADER_FIELDS.ALT).split(","), values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0)));
index++;
}
return new VCFRecord(mHeader, values, mFormatString, genotypeRecords);
return new VCFRecord(values, mFormatString, genotypeRecords);
}
return new VCFRecord(mHeader, values);
return new VCFRecord(values);
}
/**
@ -216,7 +216,6 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
Map<String, String> tagToValue = new HashMap<String, String>();
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
List<String> bases = new ArrayList<String>();
int addedCount = 0;
String keyStrings[] = formatString.split(":");
for (String key : keyStrings) {
String parse;
@ -236,19 +235,20 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
addAllele(m.group(1), altAlleles, referenceBase, bases);
if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases);
}
tagToValue.put(key, parse);
addedCount++;
else {
tagToValue.put(key, parse);
}
if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;
genotypeString = genotypeString.substring(nextDivider + 1, genotypeString.length());
}
// catch some common errors, either there are too many field keys or there are two many field values
if (keyStrings.length != tagToValue.size())
if (keyStrings.length != tagToValue.size() + ((bases.size() > 0) ? 1 : 0))
throw new RuntimeException("VCFReader: genotype value count doesn't match the key count (expected "
+ keyStrings.length + " but saw " + tagToValue.size() + ")");
else if (genotypeString.length() > 0)
throw new RuntimeException("VCFReader: genotype string contained additional unprocessed fields: " + genotypeString
+ ". This most likely means that the format string is shorter then the value fields.");
return new VCFGenotypeRecord(sampleName, tagToValue, bases, phase, referenceBase);
return new VCFGenotypeRecord(sampleName, bases, phase, tagToValue);
}

View File

@ -1,6 +1,8 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.utils.Utils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@ -8,90 +10,165 @@ import java.util.Map;
/** the basic VCF record type */
public class VCFRecord {
// required field values
private final Map<VCFHeader.HEADER_FIELDS, String> mValues = new HashMap<VCFHeader.HEADER_FIELDS, String>();
public static final String FIELD_SEPERATOR = "\t";
// the reference base
private char mReferenceBase;
// our contig
private String mChrome;
// our position
private int mPosition;
// our id; set to '.' if not available
private String mID;
// the alternate bases
private final List<String> mAlts = new ArrayList<String>();
// our qual value
private int mQual;
// our filter string
private String mFilterString;
// our info fields
private final Map<String, String> mInfoFields = new HashMap<String, String>();
private final String mGenotypeFormatString;
// our genotype sample fields
private final List<VCFGenotypeRecord> mGenotypeFields;
// the format String, which specifies what each genotype can contain for values
private final String mFormatString;
// the associated header
private final VCFHeader mHeader;
private final List<VCFGenotypeRecord> mGenotypeFields = new ArrayList<VCFGenotypeRecord>();
/**
* given a VCF header, and the values for each of the columns, create a VCF record.
*
* @param header the VCF header
* @param columnValues a mapping of header strings to values
* @param columnValues a mapping of header strings to values
* @param formatString the format string for the genotype records
* @param genotypeRecords the genotype records
*/
public VCFRecord(VCFHeader header, Map<VCFHeader.HEADER_FIELDS, String> columnValues, String formatString, List<VCFGenotypeRecord> genotypeRecords) {
mHeader = header;
mValues.putAll(columnValues);
mFormatString = formatString;
mGenotypeFields = new ArrayList<VCFGenotypeRecord>();
public VCFRecord(Map<VCFHeader.HEADER_FIELDS, String> columnValues, String formatString, List<VCFGenotypeRecord> genotypeRecords) {
extractFields(columnValues);
mGenotypeFields.addAll(genotypeRecords);
mGenotypeFormatString = formatString;
}
/**
* create a VCF record
*
* @param referenceBase the reference base to use
* @param contig the contig this variant is on
* @param position our position
* @param ID our ID string
* @param altBases the list of alternate bases
* @param qual the qual field
* @param filters the filters used on this variant
* @param infoFields the information fields
* @param genotypeObjects the genotype objects
*/
public VCFRecord(char referenceBase,
String contig,
int position,
String ID,
List<String> altBases,
int qual,
String filters,
Map<String, String> infoFields,
String genotypeFormatString,
List<VCFGenotypeRecord> genotypeObjects) {
this.setReferenceBase(referenceBase);
this.mChrome = contig;
this.setPosition(position);
this.mID = ID;
for (String alt : altBases)
this.addAlternateBase(alt);
this.setQual(qual);
this.setFilterString(filters);
this.mInfoFields.putAll(infoFields);
mGenotypeFormatString = genotypeFormatString;
this.mGenotypeFields.addAll(genotypeObjects);
}
/**
* given a VCF header, and the values for each of the columns, create a VCF record.
*
* @param header the VCF header
* @param columnValues a mapping of header strings to values
* @param columnValues a mapping of header strings to values
*/
public VCFRecord(VCFHeader header, Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
mHeader = header;
mValues.putAll(columnValues);
mGenotypeFields = null;
mFormatString = null;
public VCFRecord(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
extractFields(columnValues);
mGenotypeFormatString = null;
}
/**
* extract the field values from the passed in array
*
* @param columnValues a map of the header fields to values
*/
private void extractFields(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
for (VCFHeader.HEADER_FIELDS val : columnValues.keySet()) {
switch (val) {
case CHROM:
this.setChomosome(columnValues.get(val));
break;
case POS:
this.setPosition(Integer.valueOf(columnValues.get(val)));
break;
case ID:
this.setID(columnValues.get(val));
break;
case REF:
if (columnValues.get(val).length() != 1)
throw new IllegalArgumentException("Reference base should be a single character");
this.setReferenceBase(columnValues.get(val).charAt(0));
break;
case ALT:
String values[] = columnValues.get(val).split(",");
for (String alt : values)
addAlternateBase(alt);
break;
case QUAL:
this.setQual(Integer.valueOf(columnValues.get(val)));
break;
case FILTER:
this.setFilterString(columnValues.get(val));
break;
case INFO:
String vals[] = columnValues.get(val).split(";");
for (String alt : vals) {
String keyVal[] = alt.split("=");
if (keyVal.length == 1 && keyVal[0].equals(".")) {
this.addInfoField(keyVal[0], "");
break;
}
if (keyVal.length != 2)
throw new IllegalArgumentException("info field key-value pair did not parse into key->value pair: " + alt);
this.addInfoField(keyVal[0], keyVal[1]);
}
break;
}
}
}
/**
* do we have genotyping data
*
* @return true if we have genotyping data, false otherwise
*/
public boolean hasGenotypeData() {
if (mGenotypeFields==null) {
if (mGenotypeFields.size() < 1) {
return false;
}
return true;
}
/**
* get the format string
* @return the format sting, null if it doesn't exist
*/
public String getFormatString() {
return mFormatString;
}
/**
* get a required field, given the field tag
*
* @param field the key for the field
*
* @return the field value
*/
public String getValue(VCFHeader.HEADER_FIELDS field) {
return mValues.get(field);
}
/** @return the string for the chromosome that this VCF record is associated with */
public String getChromosome() {
return mValues.get(VCFHeader.HEADER_FIELDS.CHROM);
return this.mChrome;
}
/** @return this VCF records position on the specified chromosome */
public long getPosition() {
return Long.valueOf(this.mValues.get(VCFHeader.HEADER_FIELDS.POS));
return this.mPosition;
}
/** @return the ID value for this record */
public String getID() {
return mValues.get(VCFHeader.HEADER_FIELDS.ID);
return this.mID;
}
/**
@ -100,8 +177,7 @@ public class VCFRecord {
* @return either A, T, C, G, or N
*/
public char getReferenceBase() {
// TODO: this field isn't validated correctly
return mValues.get(VCFHeader.HEADER_FIELDS.REF).charAt(0);
return this.mReferenceBase;
}
/**
@ -109,20 +185,17 @@ public class VCFRecord {
*
* @return an array of strings representing the alt alleles, or null if there are none
*/
public String[] getAlternateAlleles() {
if (mValues.get(VCFHeader.HEADER_FIELDS.ALT).trim().equals(".")) {
return null;
}
return mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(",");
public List<String> getAlternateAlleles() {
return this.mAlts;
}
public boolean hasAlternateAllele() {
return getAlternateAlleles() != null;
return getAlternateAlleles().size() > 0;
}
/** @return the phred-scaled quality score */
public int getQual() {
return Integer.valueOf(mValues.get(VCFHeader.HEADER_FIELDS.QUAL));
return this.mQual;
}
/**
@ -131,10 +204,10 @@ public class VCFRecord {
* @return an array of strings representing the filtering criteria, or null if none were applied
*/
public String[] getFilteringCodes() {
if (mValues.get(VCFHeader.HEADER_FIELDS.FILTER).trim().equals("0")) {
return null;
}
return mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(";");
//if (this.mFilterString.equals("0")) {
// return null;
//}
return this.mFilterString.split(";");
}
public boolean hasFilteringCodes() {
@ -147,25 +220,18 @@ public class VCFRecord {
* @return a map, of the info key-value pairs
*/
public Map<String, String> getInfoValues() {
Map<String, String> ret = new HashMap<String, String>();
String infoSplit[] = mValues.get(VCFHeader.HEADER_FIELDS.INFO).split(";");
for (String s : infoSplit) {
String keyValue[] = s.split("=");
if (keyValue.length != 2)
throw new RuntimeException("Key value pairs must have both a key and a value; pair: " + s);
ret.put(keyValue[0], keyValue[1]);
}
return ret;
return this.mInfoFields;
}
/** @return the number of columnsof data we're storing */
public int getColumnCount() {
if (this.hasGenotypeData()) return mGenotypeFields.size() + mValues.size();
return mValues.size();
if (this.hasGenotypeData()) return mGenotypeFields.size() + VCFHeader.HEADER_FIELDS.values().length;
return VCFHeader.HEADER_FIELDS.values().length;
}
/**
* return the mapping of the format tags to the specified sample's values
*
* @return a VCFGenotypeRecord
*/
public List<VCFGenotypeRecord> getVCFGenotypeRecords() {
@ -174,9 +240,130 @@ public class VCFRecord {
/** @return a List of the sample names */
public String[] getSampleNames() {
String ret[] = new String[mHeader.getGenotypeSamples().size()];
mHeader.getGenotypeSamples().toArray(ret);
return ret;
String names[] = new String[mGenotypeFields.size()];
int index = 0;
for (VCFGenotypeRecord rec : this.mGenotypeFields)
names[index++] = rec.getSampleName();
return names;
}
public String getGenotypeFormatString() {
return mGenotypeFormatString;
}// the formatting string for our genotype records
public void setReferenceBase(char referenceBase) {
// upppercase the character
referenceBase = (char) ((referenceBase > 96) ? referenceBase - 32 : referenceBase);
if (referenceBase != 'A' && referenceBase != 'C' && referenceBase != 'T' && referenceBase != 'G' && referenceBase != 'N')
throw new IllegalArgumentException("Reference base must be one of A,C,G,T,N, we saw: " + referenceBase);
this.mReferenceBase = referenceBase;
}
public void setPosition(int mPosition) {
if (mPosition < 0)
throw new IllegalArgumentException("Position values must be greater than 0");
this.mPosition = mPosition;
}
public void setChomosome(String mChrome) {
this.mChrome = mChrome;
}
public void setID(String mID) {
this.mID = mID;
}
public void setQual(int mQual) {
if (mQual < 0)
throw new IllegalArgumentException("Qual values must be greater than 0");
this.mQual = mQual;
}
public void setFilterString(String mFilterString) {
this.mFilterString = mFilterString;
}
public void addGenotypeFields(VCFGenotypeRecord mGenotypeFields) {
this.mGenotypeFields.add(mGenotypeFields);
}
public void addAlternateBase(String base) {
if (base.length() == 1) {
char nuc = (char) ((base.charAt(0) > 96) ? base.charAt(0) - 32 : base.charAt(0));
if (nuc != 'A' && nuc != 'C' && nuc != 'T' && nuc != 'G' && nuc != '.')
throw new IllegalArgumentException("Alternate base must be either A,C,T,G,. or if an indel it must contain length information: " + base);
} else {
// we must be an indel, check that the first character is I or D
char nuc = (char) ((base.charAt(0) > 96) ? base.charAt(0) - 32 : base.charAt(0));
if (nuc != 'I' && nuc != 'D')
throw new IllegalArgumentException("Alternate bases of length greater then one must be an indel: " + base);
}
this.mAlts.add(base);
}
public void addInfoField(String key, String value) {
this.mInfoFields.put(key, value);
}
public String toString() {
StringBuilder builder = new StringBuilder();
// else builder.append(FIELD_SEPERATOR + record.getValue(field));
// CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO
builder.append(getChromosome() + FIELD_SEPERATOR);
builder.append(getPosition() + FIELD_SEPERATOR);
builder.append(getID() + FIELD_SEPERATOR);
builder.append(getReferenceBase() + FIELD_SEPERATOR);
String alts = "";
for (String str : this.getAlternateAlleles()) alts += str + ",";
builder.append(alts.substring(0, alts.length() - 1) + FIELD_SEPERATOR);
builder.append(getQual() + FIELD_SEPERATOR);
builder.append(Utils.join(";", getFilteringCodes()) + FIELD_SEPERATOR);
String info = "";
for (String str : this.getInfoValues().keySet()) {
if (str.equals("."))
info = ".";
else
info += str + "=" + getInfoValues().get(str) + ";";
}
if (info.length() > 1) builder.append(info.substring(0, info.length() - 1));
else builder.append(info);
if (this.hasGenotypeData()) {
builder.append(FIELD_SEPERATOR + this.getGenotypeFormatString());
for (VCFGenotypeRecord rec : this.getVCFGenotypeRecords()) {
builder.append(FIELD_SEPERATOR);
if (!rec.toGenotypeString(this.mAlts).equals(""))
builder.append(rec.toGenotypeString(this.mAlts));
for (String s : rec.getFields().keySet()) {
if (rec.getFields().get(s).equals("")) continue;
builder.append(":");
builder.append(rec.getFields().get(s));
}
}
}
return builder.toString();
}
/**
* compare two VCF records
*
* @param other the other VCF record
*
* @return true if they're equal
*/
public boolean equals(VCFRecord other) {
if (!this.mAlts.equals(other.mAlts)) return false;
if (this.mReferenceBase != other.mReferenceBase) return false;
if (!this.mChrome.equals(other.mChrome)) return false;
if (this.mPosition != other.mPosition) return false;
if (!this.mID.equals(other.mID)) return false;
if (this.mQual != other.mQual) return false;
if (!this.mFilterString.equals(other.mFilterString)) return false;
if (!this.mInfoFields.equals(other.mInfoFields)) return false;
if (!this.mGenotypeFields.equals(other.mGenotypeFields)) return false;
return true;
}
}

View File

@ -61,33 +61,13 @@ public class VCFWriter {
throw new RuntimeException("Record has " + record.getColumnCount() +
" columns, when is should have " + mHeader.getColumnCount());
}
StringBuilder builder = new StringBuilder();
String vcfString = record.toString();
try {
mWriter.write(vcfString + "\n");
} catch (IOException e) {
throw new RuntimeException("Unable to write the VCF object to a file");
}
// first output the required fields in order
boolean first = true;
for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields()) {
if (first) {
first = false;
builder.append(record.getValue(field));
} else builder.append(FIELD_SEPERATOR + record.getValue(field));
}
if (record.hasGenotypeData()) {
builder.append(FIELD_SEPERATOR + record.getFormatString());
for (VCFGenotypeRecord rec : record.getVCFGenotypeRecords()) {
builder.append(FIELD_SEPERATOR);
boolean ft = true;
for (String s : rec.getFields().keySet()) {
if (!ft) builder.append(":");
else ft = true;
builder.append(rec.getFields().get(s));
}
}
try {
mWriter.write(builder.toString() + "\n");
} catch (IOException e) {
throw new RuntimeException("Unable to write the VCF object to a file");
}
}
}
/** attempt to close the VCF file */

View File

@ -0,0 +1,179 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.junit.Assert;
import static org.junit.Assert.fail;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* @author aaron
* <p/>
* Class RodVCFTest
* <p/>
* test out the rod VCF
*/
public class RodVCFTest extends BaseTest {
private static IndexedFastaSequenceFile seq;
private static File vcfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf");
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File("/broad/1KG/reference/human_b36_both.fasta"));
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
}
private RodVCF getVCFObject() {
RodVCF vcf = new RodVCF("VCF");
VCFHeader header = null;
try {
header = (VCFHeader) vcf.initialize(vcfFile);
} catch (FileNotFoundException e) {
fail("Unable to open VCF file");
}
header.checkVCFVersion();
return vcf;
}
@Test
public void testInitialize() {
getVCFObject();
}
@Test
public void testToIterator() {
RodVCF vcf = getVCFObject();
VCFReader reader = new VCFReader(vcfFile);
Iterator<RodVCF> iter = vcf.createIterator("VCF", vcfFile);
while (iter.hasNext()) {
VCFRecord rec1 = reader.next();
VCFRecord rec2 = iter.next().mCurrentRecord;
if (!rec1.equals(rec2)) {
fail("VCF record rec1 != rec2");
}
}
}
@Test
public void testToMAF() {
RodVCF vcf = getVCFObject();
Iterator<RodVCF> iter = vcf.createIterator("VCF", vcfFile);
RodVCF rec = iter.next();
Assert.assertEquals(rec.getNonRefAlleleFrequency(), 0.786, 0.00001);
rec = iter.next();
rec = iter.next();
rec = iter.next();
Assert.assertEquals(rec.getNonRefAlleleFrequency(), 0.0, 0.00001);
}
@Test
public void testToString() {
// slightly altered line, due to map ordering
String firstLine = "20\t14370\trs6054257\tG\tA\t29\t0\tDP=258;AF=0.786;NS=58\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5\n";
RodVCF vcf = getVCFObject();
VCFReader reader = new VCFReader(vcfFile);
Iterator<RodVCF> iter = vcf.createIterator("VCF", vcfFile);
boolean first = true;
while (iter.hasNext()) {
VCFRecord rec1 = reader.next();
VCFRecord rec2 = iter.next().mCurrentRecord;
if (!rec1.toString().equals(rec2.toString())) {
fail("VCF record rec1.toString() != rec2.toString()");
}
// verify the first line too
if (first) {
if (!firstLine.equals(rec1.toString() + "\n")) {
fail("VCF record rec1.toString() != expected string :\n" + rec1.toString() + firstLine);
}
first = false;
}
}
}
@Test
public void testType() {
RodVCF vcf = getVCFObject();
Iterator<RodVCF> iter = vcf.createIterator("VCF", vcfFile);
RodVCF rec = iter.next();
Assert.assertTrue(rec.isSNP());
rec = iter.next();
rec = iter.next();
rec = iter.next();
rec = iter.next();
Assert.assertTrue(rec.isIndel());
Assert.assertTrue(rec.isInsertion());
Assert.assertTrue(rec.isDeletion());
}
@Test
public void testGetGenotypes() {
RodVCF vcf = getVCFObject();
Iterator<RodVCF> iter = vcf.createIterator("VCF", vcfFile);
RodVCF rec = iter.next();
// we should get back a ref 'G' and an alt 'A'
List<Genotype> list = rec.getGenotypes();
List<String> knowngenotypes = new ArrayList<String>();
knowngenotypes.add("GG");
knowngenotypes.add("AG");
knowngenotypes.add("AA");
Assert.assertEquals(3, list.size());
for (Genotype g: list) {
Assert.assertTrue(knowngenotypes.contains(g.getBases()));
}
}
@Test
public void testGetGenotypesQual() {
RodVCF vcf = getVCFObject();
Iterator<RodVCF> iter = vcf.createIterator("VCF", vcfFile);
RodVCF rec = iter.next();
// we should get back a ref 'G' and an alt 'A'
List<Genotype> list = rec.getGenotypes();
Assert.assertEquals(4.8,list.get(0).getNegLog10PError(),0.0001);
Assert.assertEquals(4.8,list.get(1).getNegLog10PError(),0.0001);
Assert.assertEquals(4.3,list.get(2).getNegLog10PError(),0.0001);
}
@Test
public void testHasGenotypes() {
RodVCF vcf = getVCFObject();
Iterator<RodVCF> iter = vcf.createIterator("VCF", vcfFile);
RodVCF rec = iter.next();
Assert.assertTrue(rec.hasGenotype(DiploidGenotype.valueOf("GG")));
Assert.assertTrue(rec.hasGenotype(DiploidGenotype.valueOf("AG")));
Assert.assertTrue(rec.hasGenotype(DiploidGenotype.valueOf("AA")));
Assert.assertTrue(!rec.hasGenotype(DiploidGenotype.valueOf("TT")));
}
@Test
public void testGetLocation() {
RodVCF vcf = getVCFObject();
Iterator<RodVCF> iter = vcf.createIterator("VCF", vcfFile);
RodVCF rec = iter.next();
GenomeLoc loc = rec.getLocation();
Assert.assertEquals(14370,loc.getStart());
Assert.assertEquals(14370,loc.getStop());
Assert.assertTrue(loc.getContig().equals("20"));
}
}

View File

@ -8,64 +8,63 @@ import java.util.Arrays;
public class VariantFiltrationIntegrationTest extends WalkerTest {
@Test
public void testIntervals() {
String[] md5DoC = {"5f176d27eba04c8e08b54b60fd96758e", "21c8e1f9dc65fdfb39347547f9b04011"};
String[] md5DoC = {"eada262e03738876a2b5b94d56f0951e", "21c8e1f9dc65fdfb39347547f9b04011"};
WalkerTestSpec spec1 = new WalkerTestSpec(
"-T VariantFiltration -X DepthOfCoverage:max=70 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5DoC));
executeTest("testDoCFilter", spec1);
String[] md5AlleleBalance = {"dfa47f9465214445b737ec49025d6b5f", "a13e4ce6260bf9f33ca99dc808b8e6ad"};
String[] md5AlleleBalance = {"0ade6c97a46245a65558d115ef1e3956", "a13e4ce6260bf9f33ca99dc808b8e6ad"};
WalkerTestSpec spec2 = new WalkerTestSpec(
"-T VariantFiltration -X AlleleBalance:low=0.25,high=0.75 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5AlleleBalance));
executeTest("testAlleleBalanceFilter", spec2);
String[] md5Strand = {"2d32ddd6daccd08f3df148f4d373fa5e", "0f7db0aad764268ee8fa3b857df8d87d"};
String[] md5Strand = {"1a9568e3f7e88483e7fbc5f981a7973c", "0f7db0aad764268ee8fa3b857df8d87d"};
WalkerTestSpec spec3 = new WalkerTestSpec(
"-T VariantFiltration -X FisherStrand:pvalue=0.0001 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5Strand));
executeTest("testStrandFilter", spec3);
String[] md5Lod = {"157e7beab140f4252ed41ebfc9a508f2", "7e0c4f2b0fda85fd2891eee76c396a55"};
String[] md5Lod = {"64bc742d0f44a7279ac6c4ad5c28da28", "7e0c4f2b0fda85fd2891eee76c396a55"};
WalkerTestSpec spec4 = new WalkerTestSpec(
"-T VariantFiltration -X LodThreshold:lod=10 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5Lod));
executeTest("testLodFilter", spec4);
String[] md5MQ0 = {"15738f3717dfa3192566a7ddf301fe11", "3203de335621851bccf596242b079e23"};
String[] md5MQ0 = {"1201176efca694e2566a9de1508517b8", "3203de335621851bccf596242b079e23"};
WalkerTestSpec spec5 = new WalkerTestSpec(
"-T VariantFiltration -X MappingQualityZero:max=70 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5MQ0));
executeTest("testMappingQuality0Filter", spec5);
String[] md5MQ = {"e7fb1f166d2359f653f98f8215215aca", "ecc777feedea61f7b570d114c2ab89b1"};
String[] md5MQ = {"ed922e58c856e1bc3f125b635dea48fc", "ecc777feedea61f7b570d114c2ab89b1"};
WalkerTestSpec spec6 = new WalkerTestSpec(
"-T VariantFiltration -X MappingQuality:min=20 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5MQ));
executeTest("testRMSMappingQualityFilter", spec6);
String[] md5OnOff = {"fe4acc1505f19480f275d7a96b0e3ddb", "67f2e1bc025833b0fa31f47195198997"};
String[] md5OnOff = {"8a00c5ad80c43e468186ce0147553048", "67f2e1bc025833b0fa31f47195198997"};
WalkerTestSpec spec7 = new WalkerTestSpec(
"-T VariantFiltration -X OnOffGenotypeRatio:threshold=0.9 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5OnOff));
executeTest("testOnOffGenotypeFilter", spec7);
String[] md5Clusters = {"f5c7c5da0198c4aaafdfcfb3c356eedc", "8fa6b6ffc93ee7fb8d6b52a7fb7815ef"};
String[] md5Clusters = {"ee9d8039490354cc7f2c574b5d10c7d8", "8fa6b6ffc93ee7fb8d6b52a7fb7815ef"};
WalkerTestSpec spec8 = new WalkerTestSpec(
"-T VariantFiltration -X ClusteredSnps:window=10,snps=3 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5Clusters));
executeTest("testClusteredSnpsFilter", spec8);
String[] md5Indels = {"30bf4c764f6dfd006d919ecaceee0166", "8e0e915a1cb63d7049e0671ed00101fe"};
String[] md5Indels = {"90eb46f8d2bf4c0a72716e35c914839d", "8e0e915a1cb63d7049e0671ed00101fe"};
WalkerTestSpec spec9 = new WalkerTestSpec(
"-T VariantFiltration -X IndelArtifact -B indels,PointIndel,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.indels -B cleaned,CleanedOutSNP,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.realigner_badsnps -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,

View File

@ -41,11 +41,10 @@ public class VCFReaderTest extends BaseTest {
char referenceBase = 'N';
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
Assert.assertEquals(referenceBase,rec.getReference());
Assert.assertEquals("N",rec.getAllele().get(0));
Assert.assertEquals("A",rec.getAllele().get(1));
Assert.assertEquals("N",rec.getAlleles().get(0));
Assert.assertEquals("A",rec.getAlleles().get(1));
Map<String,String> values = rec.getFields();
Assert.assertEquals(4,values.size());
Assert.assertEquals(3,values.size());
Assert.assertTrue(values.get("B").equals("2"));
Assert.assertTrue(values.get("C").equals("3"));
Assert.assertTrue(values.get("D").equals("4"));
@ -63,11 +62,10 @@ public class VCFReaderTest extends BaseTest {
char referenceBase = 'N';
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
Assert.assertEquals(referenceBase,rec.getReference());
Assert.assertEquals("N",rec.getAllele().get(0));
Assert.assertEquals("A",rec.getAllele().get(1));
Assert.assertEquals("N",rec.getAlleles().get(0));
Assert.assertEquals("A",rec.getAlleles().get(1));
Map<String,String> values = rec.getFields();
Assert.assertEquals(4,values.size());
Assert.assertEquals(3,values.size());
Assert.assertTrue(values.get("B").equals(""));
Assert.assertTrue(values.get("C").equals(""));
Assert.assertTrue(values.get("D").equals("4"));
@ -84,11 +82,10 @@ public class VCFReaderTest extends BaseTest {
char referenceBase = 'N';
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
Assert.assertEquals(referenceBase,rec.getReference());
Assert.assertEquals("N",rec.getAllele().get(0));
Assert.assertEquals("A",rec.getAllele().get(1));
Assert.assertEquals("N",rec.getAlleles().get(0));
Assert.assertEquals("A",rec.getAlleles().get(1));
Map<String,String> values = rec.getFields();
Assert.assertEquals(4,values.size());
Assert.assertEquals(3,values.size());
Assert.assertTrue(values.get("B").equals(""));
Assert.assertTrue(values.get("C").equals(""));
Assert.assertTrue(values.get("D").equals(""));

View File

@ -0,0 +1,46 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.BaseTest;
import org.junit.Assert;
import org.junit.Test;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
*
* @author aaron
*
* Class VCFRecordTest
*
* test the basic functionality of the vcf record
*/
public class VCFRecordTest extends BaseTest {
private VCFRecord makeFakeVCFRecord() {
List<String> altBases = new ArrayList<String>();
altBases.add("C");
altBases.add("D1");
Map<String,String> infoFields = new HashMap<String,String>();
infoFields.put("DP","50");
List<VCFGenotypeRecord> genotypeObjects = new ArrayList<VCFGenotypeRecord>();
Map<String, String> keyValues = new HashMap<String,String>();
keyValues.put("AA","2");
List<String> Alleles = new ArrayList<String>();
Alleles.add("A");
genotypeObjects.add(new VCFGenotypeRecord("SampleName", Alleles, VCFGenotypeRecord.PHASE.PHASED, keyValues));
return new VCFRecord('A',"chr1",1,"RANDOM",altBases,0,".",infoFields, "GT:AA",genotypeObjects);
}
@Test
public void testGetGenotypes() {
VCFRecord rec = makeFakeVCFRecord();
List<VCFGenotypeRecord> genotypeObjects = rec.getVCFGenotypeRecords();
Assert.assertEquals(1,genotypeObjects.size());
Assert.assertTrue(genotypeObjects.get(0).getSampleName().equals("SampleName"));
}
}

View File

@ -60,18 +60,24 @@ public class VCFWriterTest extends BaseTest {
* @return a VCFRecord
*/
private VCFRecord createVCFRecord(VCFHeader header) {
Map<VCFHeader.HEADER_FIELDS,String> map = new HashMap<VCFHeader.HEADER_FIELDS,String>();
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values())
map.put(field,String.valueOf(1));
List<String> altBases = new ArrayList<String>();
altBases.add("C");
altBases.add("D1");
Map<String,String> infoFields = new HashMap<String,String>();
infoFields.put("DP","50");
List<VCFGenotypeRecord> gt = new ArrayList<VCFGenotypeRecord>();
for (String name : header.getGenotypeSamples()) {
Map<String,String> str = new HashMap<String,String>();
str.put("key","0|0");
List<String> alleles = new ArrayList<String>();
alleles.add("AAA");
gt.add(new VCFGenotypeRecord(name,str,alleles, VCFGenotypeRecord.PHASE.PHASED,'A'));
str.put("bb","0");
List<String> myAlleles = new ArrayList<String>();
myAlleles.add("C");
myAlleles.add("D1");
gt.add(new VCFGenotypeRecord(name, myAlleles, VCFGenotypeRecord.PHASE.PHASED, str));
}
return new VCFRecord(header,map,"GT",gt);
return new VCFRecord('A',"chr1",1,"RANDOM",altBases,0,".",infoFields, "GT:AA",gt);
}