From 7b39aa496638e9b89749ba2ce3fd94c285ef8690 Mon Sep 17 00:00:00 2001 From: aaron Date: Fri, 18 Sep 2009 20:19:34 +0000 Subject: [PATCH] Adding the VCF ROD. Also changed the VCF objects to much more user friendly. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1658 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/refdata/RodGLF.java | 37 +- .../sting/gatk/refdata/RodGeliText.java | 50 ++- .../gatk/refdata/RodGenotypeChipAsGFF.java | 28 +- .../sting/gatk/refdata/RodVCF.java | 329 +++++++++++++++++ .../sting/gatk/refdata/rodDbSNP.java | 190 ++++++---- .../filters/VariantFiltrationWalker.java | 2 +- .../varianteval/GenotypeConcordance.java | 8 +- .../varianteval/IndelMetricsAnalysis.java | 8 +- .../PooledGenotypeConcordance.java | 16 +- .../walkers/varianteval/VariantCounter.java | 2 +- .../varianteval/VariantDBCoverage.java | 4 +- .../walkers/variantstovcf/VariantsToVCF.java | 8 +- .../sting/utils/genotype/BasicVariation.java | 38 +- .../genotype/VariantBackedByGenotype.java | 13 +- .../sting/utils/genotype/Variation.java | 25 +- .../utils/genotype/vcf/VCFGenotypeRecord.java | 77 ++-- .../sting/utils/genotype/vcf/VCFReader.java | 14 +- .../sting/utils/genotype/vcf/VCFRecord.java | 333 ++++++++++++++---- .../sting/utils/genotype/vcf/VCFWriter.java | 32 +- .../sting/gatk/refdata/RodVCFTest.java | 179 ++++++++++ .../VariantFiltrationIntegrationTest.java | 19 +- .../utils/genotype/vcf/VCFReaderTest.java | 21 +- .../utils/genotype/vcf/VCFRecordTest.java | 46 +++ .../utils/genotype/vcf/VCFWriterTest.java | 22 +- 24 files changed, 1206 insertions(+), 295 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java create mode 100755 java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java create mode 100755 java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RodGLF.java b/java/src/org/broadinstitute/sting/gatk/refdata/RodGLF.java index b3868242a..4977b195e 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RodGLF.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RodGLF.java @@ -15,6 +15,8 @@ import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.Iterator; +import java.util.List; +import java.util.ArrayList; /** @@ -123,6 +125,12 @@ public class RodGLF implements ReferenceOrderedDatum, Variation, Iterator methods (where Variant=SNP,Insertion, etc) should * return false at such site to ensure consistency. This method is included for use with genotyping calls (isGenotype()==true), it makes @@ -154,9 +162,9 @@ public class RodGLF implements ReferenceOrderedDatum, Variation, Iterator getAlternateBases() { + List list = new ArrayList(); + list.add(this.getAlternateBase()); + return list; + } + /** * Returns minor allele frequency. * @@ -270,8 +290,7 @@ public class RodGLF implements ReferenceOrderedDatum, Variation, Iterator getAlternateBases() { + List list = new ArrayList(); + list.add(this.getAlternateBase()); + return list; + } + public boolean isIndel() { return false; } @@ -306,7 +319,7 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation if (genotypeLikelihoods[likelihoodIndex] > bestLikelihood) { bestLikelihood = genotypeLikelihoods[likelihoodIndex]; - bestGenotype = Genotype.values()[likelihoodIndex].toString(); + bestGenotype = Genotype_Strings.values()[likelihoodIndex].toString(); } } @@ -317,8 +330,8 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation } for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) { - if (refBase == Genotype.values()[likelihoodIndex].toString().charAt(0) && - refBase == Genotype.values()[likelihoodIndex].toString().charAt(1)) { + if (refBase == Genotype_Strings.values()[likelihoodIndex].toString().charAt(0) && + refBase == Genotype_Strings.values()[likelihoodIndex].toString().charAt(1)) { refLikelihood = genotypeLikelihoods[likelihoodIndex]; } } @@ -328,19 +341,28 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation this.lodBtnb = (bestLikelihood - nextBestLikelihood); } + /** * get the likelihoods * * @return an array in lexigraphical order of the likelihoods */ @Override - public org.broadinstitute.sting.utils.genotype.Genotype getGenotype(DiploidGenotype x) { - // figure out the best -switch to this - /*Integer values[] = Utils.SortPermutation(this.genotypeLikelihoods); - int indexThis = x.ordinal(); - int other = (x.toString().equals(Genotype.values()[values[1]])) ? values[9] : values[8]; - double lod = (genotypeLikelihoods[indexThis] - genotypeLikelihoods[other]);*/ - return new BasicGenotype(getLocation(),x.toString(), refBase, lodBtnb); + public Genotype getGenotype(DiploidGenotype x) { + if (x.toString() != this.getAltBasesFWD()) throw new IllegalStateException("We don't contain that genotype!"); + return new BasicGenotype(getLocation(), x.toString(), refBase, lodBtnb); + } + + /** + * get the likelihoods + * + * @return an array in lexigraphical order of the likelihoods + */ + @Override + public List getGenotypes() { + List ret = new ArrayList(); + ret.add(new BasicGenotype(getLocation(), this.getAltBasesFWD(), refBase, lodBtnb)); + return ret; } /** diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RodGenotypeChipAsGFF.java b/java/src/org/broadinstitute/sting/gatk/refdata/RodGenotypeChipAsGFF.java index 0f0ad103c..ff0189587 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RodGenotypeChipAsGFF.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RodGenotypeChipAsGFF.java @@ -187,10 +187,22 @@ public class RodGenotypeChipAsGFF extends BasicReferenceOrderedDatum implements * @return */ @Override - public String getAlternateBases() { + public String getAlternateBase() { return this.feature; } + /** + * gets the alternate bases. Use this method if teh allele count is greater then 2 + * + * @return + */ + @Override + public List getAlternateBases() { + List list = new ArrayList(); + list.add(this.getAlternateBase()); + return list; + } + /** * get the frequency of this variant * @@ -247,6 +259,18 @@ public class RodGenotypeChipAsGFF extends BasicReferenceOrderedDatum implements public boolean isBiallelic() { return true; } public int length() { return 1; } + /** + * get the likelihoods + * + * @return an array in lexigraphical order of the likelihoods + */ + @Override + public List getGenotypes() { + List ret = new ArrayList(); + ret.add(new BasicGenotype(this.getLocation(),this.feature,this.getRefSnpFWD(),this.getConsensusConfidence())); + return ret; + } + /** * get the likelihoods * @@ -257,7 +281,7 @@ public class RodGenotypeChipAsGFF extends BasicReferenceOrderedDatum implements if (!x.toString().equals(this.getAltBasesFWD())) throw new IllegalStateException("Unable to retrieve genotype"); return new BasicGenotype(this.getLocation(),this.feature,this.getRefSnpFWD(),this.getConsensusConfidence()); } - + /** * do we have the specified genotype? not all backedByGenotypes * have all the genotype data. diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java b/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java new file mode 100755 index 000000000..74622e77f --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java @@ -0,0 +1,329 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotype.*; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; +import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; +import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + + +/** + * @author aaron + *

+ * Class RodVCF + *

+ * An implementation of the ROD for VCF. + */ +public class RodVCF extends BasicReferenceOrderedDatum implements Variation, VariantBackedByGenotype, Iterator { + // our VCF related information + private VCFReader mReader; + public VCFRecord mCurrentRecord; + + public RodVCF(String name) { + super(name); + } + + public void assertNotNull() { + if (mCurrentRecord == null) { + throw new UnsupportedOperationException("The current Record is null"); + } + } + + @Override + public boolean parseLine(Object header, String[] parts) throws IOException { + throw new UnsupportedOperationException("We don't support the parse line"); + } + + public Object initialize(final File source) throws FileNotFoundException { + if (mReader == null) { + mReader = new VCFReader(source); + } + return mReader.getHeader(); + } + + @Override + public String toString() { + if (this.mCurrentRecord != null) + return this.mCurrentRecord.toString(); + else + return ""; + } + + public Iterator createIterator(String name, File file) { + RodVCF vcf = new RodVCF(name); + try { + vcf.initialize(file); + } catch (FileNotFoundException e) { + throw new StingException("Unable to find file " + file); + } + return vcf; + } + + /** + * get the frequency of this variant + * + * @return VariantFrequency with the stored frequency + */ + @Override + public double getNonRefAlleleFrequency() { + assertNotNull(); + if (this.mCurrentRecord.getInfoValues().containsKey("AF")) { + return Double.valueOf(this.mCurrentRecord.getInfoValues().get("AF")); + } else { + // this is the poor man's AF + if (this.mCurrentRecord.getInfoValues().containsKey("AC") && this.mCurrentRecord.getInfoValues().containsKey("AN")) { + String splt[] = mCurrentRecord.getInfoValues().get("AC").split(","); + if (splt.length > 0) { + return (Double.valueOf(splt[0]) / Double.valueOf(mCurrentRecord.getInfoValues().get("AN"))); + } + } + } + + return 0.0; + } + + /** @return the VARIANT_TYPE of the current variant */ + @Override + public VARIANT_TYPE getType() { + if (this.isSNP()) return VARIANT_TYPE.SNP; + else if (this.isIndel()) return VARIANT_TYPE.INDEL; + return VARIANT_TYPE.REFERENCE; + } + + /** + * are we a SNP? If not we're a Indel/deletion + * + * @return true if we're a SNP + */ + @Override + public boolean isSNP() { + this.assertNotNull(); + if (!mCurrentRecord.hasAlternateAllele()) + return false; + for (String alt : this.mCurrentRecord.getAlternateAlleles()) { + if (alt.length() != 1) + return false; + } + return true; + } + + /** + * are we an insertion? + * + * @return true if we are, false otherwise + */ + @Override + public boolean isInsertion() { + this.assertNotNull(); + if (!mCurrentRecord.hasAlternateAllele()) + return false; + for (String alt : this.mCurrentRecord.getAlternateAlleles()) { + if (alt.startsWith("I")) + return true; + } + return false; + } + + /** + * are we an insertion? + * + * @return true if we are, false otherwise + */ + @Override + public boolean isDeletion() { + this.assertNotNull(); + if (!mCurrentRecord.hasAlternateAllele()) + return false; + for (String alt : this.mCurrentRecord.getAlternateAlleles()) { + if (alt.startsWith("D")) + return true; + } + return false; + } + + @Override + public GenomeLoc getLocation() { + this.assertNotNull(); + return GenomeLocParser.createGenomeLoc(mCurrentRecord.getChromosome(), mCurrentRecord.getPosition()); + } + + /** + * get the reference base(s) at this position + * + * @return the reference base or bases, as a string + */ + @Override + public String getReference() { + return String.valueOf(mCurrentRecord.getReferenceBase()); + } + + /** are we bi-allelic? */ + @Override + public boolean isBiallelic() { + return (this.getAlternateBases().size() == 1); + } + + /** + * get the -1 * (log 10 of the error value) + * + * @return the log based error estimate + */ + @Override + public double getNegLog10PError() { + // we're -10 log(error), we have to divide by 10 + return mCurrentRecord.getQual() / 10.0; + } + + /** + * are we truely a variant, given a reference + * + * @return false if we're a variant(indel, delete, SNP, etc), true if we're not + */ + @Override + public boolean isReference() { + return (!mCurrentRecord.hasAlternateAllele()); + } + + /** + * gets the alternate bases. If this is homref, throws an UnsupportedOperationException + * + * @return + */ + @Override + public String getAlternateBase() { + if (!this.isBiallelic()) throw new UnsupportedOperationException("We're not biallelic, so please call getAlternateBases instead"); + return this.mCurrentRecord.getAlternateAlleles().get(0); + } + + /** + * gets the alternate bases. If this is homref, throws an UnsupportedOperationException + * + * @return + */ + @Override + public List getAlternateBases() { + return this.mCurrentRecord.getAlternateAlleles(); + } + + /** + * are we an insertion or a deletion? yes, then return true. No? Well, false then. + * + * @return true if we're an insertion or deletion + */ + @Override + public boolean isIndel() { + return (!isSNP()); + } + + /** + * gets the alternate base is the case of a SNP. Throws an IllegalStateException if we're not a SNP + * of + * + * @return a char, representing the alternate base + */ + @Override + public char getAlternativeBaseForSNP() { + if (!isSNP()) throw new IllegalStateException("we're not a SNP"); + return mCurrentRecord.getAlternateAlleles().get(0).charAt(0); + } + + /** + * gets the reference base is the case of a SNP. Throws an IllegalStateException if we're not a SNP + * + * @return a char, representing the alternate base + */ + @Override + public char getReferenceForSNP() { + if (!isSNP()) throw new IllegalStateException("we're not a SNP"); + return mCurrentRecord.getReferenceBase(); + } + + /** + * get the genotypes + * + * @return a list of the genotypes + */ + @Override + public List getGenotypes() { + List genotypes = new ArrayList(); + if (!this.mCurrentRecord.hasGenotypeData()) { + return genotypes; + } + double refQual = (this.getNegLog10PError()); + // add the reference + for (VCFGenotypeRecord rec : mCurrentRecord.getVCFGenotypeRecords()) { + double qual = 0; + if (rec.getAlleles().equals(this.getReference())) + qual = refQual; + else if (rec.getFields().containsKey("GQ")) + qual = Double.valueOf(rec.getFields().get("GQ")) / 10.0; + genotypes.add(new BasicGenotype(this.getLocation(), Utils.join("", rec.getAlleles()), this.getReference().charAt(0), qual)); + } + return genotypes; + } + + + /** + * get the likelihoods + * + * @return an array in lexigraphical order of the likelihoods + */ + @Override + public Genotype getGenotype(DiploidGenotype x) { + if (x.toString().equals(getReference())) + return new BasicGenotype(this.getLocation(), getReference(), this.getReference().charAt(0), 0); + for (VCFGenotypeRecord record : mCurrentRecord.getVCFGenotypeRecords()) { + if (Utils.join("", record.getAlleles()).equals(x.toString())) { + double qual = 0.0; + if (record.getAlleles().equals(this.getReference())) + qual = this.getNegLog10PError(); + else if (record.getFields().containsKey("GQ")) + qual = Double.valueOf(record.getFields().get("GQ")) / 10.0; + return new BasicGenotype(this.getLocation(), Utils.join("", record.getAlleles()), this.getReference().charAt(0), qual); + } + } + return null; + } + + /** + * do we have the specified genotype? not all backedByGenotypes + * have all the genotype data. + * + * @param x the genotype + * + * @return true if available, false otherwise + */ + @Override + public boolean hasGenotype(DiploidGenotype x) { + if (getGenotype(x) != null) + return true; + return false; + } + + @Override + public boolean hasNext() { + return (mReader.hasNext()); + } + + @Override + public RodVCF next() { + mCurrentRecord = mReader.next(); + return this; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("You cannot remove from a VCF rod"); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java index 16fb7123b..37200a9c0 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java @@ -2,17 +2,20 @@ package org.broadinstitute.sting.gatk.refdata; import net.sf.picard.util.SequenceUtil; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.genotype.*; +import org.broadinstitute.sting.utils.genotype.BasicGenotype; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; +import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype; +import org.broadinstitute.sting.utils.genotype.Variation; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * Example format: - * 585 chr1 433 433 rs56289060 0 + - - -/C genomic insertion unknown 0 0 unknown between 1 - * 585 chr1 491 492 rs55998931 0 + C C C/T genomic single unknown 0 0 unknown exact 1 - * + * 585 chr1 433 433 rs56289060 0 + - - -/C genomic insertion unknown 0 0 unknown between 1 + * 585 chr1 491 492 rs55998931 0 + C C C/T genomic single unknown 0 0 unknown exact 1 + *

* User: mdepristo * Date: Feb 27, 2009 * Time: 10:47:14 AM @@ -60,7 +63,9 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V // manipulating the SNP information // // ---------------------------------------------------------------------- - public GenomeLoc getLocation() { return loc; } + public GenomeLoc getLocation() { + return loc; + } /** * get the reference base(s) at this position @@ -86,7 +91,8 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V return strand.equals("+"); } - /** Returns bases in the reference allele as a String. String can be empty (as in insertion into + /** + * Returns bases in the reference allele as a String. String can be empty (as in insertion into * the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters * (for longer indels). * @@ -109,18 +115,18 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V */ public char getRefSnpFWD() throws IllegalStateException { //System.out.printf("refbases is %s but %s%n", refBases, toString()); - if ( isIndel() ) throw new IllegalStateException("Variant is not a SNP"); + if (isIndel()) throw new IllegalStateException("Variant is not a SNP"); // fix - at least this way we ensure that we'll get the other base compared to getAltBasesFWD() List alleles = getAllelesFWD(); String val = (alleles.get(0).equals(refBases) ? alleles.get(0) : alleles.get(1)); return val.charAt(0); // if ( onFwdStrand() ) return refBases.charAt(0); - // else return SequenceUtil.reverseComplement(refBases).charAt(0); + // else return SequenceUtil.reverseComplement(refBases).charAt(0); } public List getAllelesFWD() { List alleles = null; - if ( onFwdStrand() ) + if (onFwdStrand()) alleles = Arrays.asList(observed.split("/")); else alleles = Arrays.asList(SequenceUtil.reverseComplement(observed).split("/")); @@ -148,13 +154,22 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V public VARIANT_TYPE getType() { return VARIANT_TYPE.SNP; }// ---------------------------------------------------------------------- + // // What kind of variant are we? // // ---------------------------------------------------------------------- - public boolean isSNP() { return varType.contains("single"); } - public boolean isInsertion() { return varType.contains("insertion"); } - public boolean isDeletion() { return varType.contains("deletion"); } + public boolean isSNP() { + return varType.contains("single"); + } + + public boolean isInsertion() { + return varType.contains("insertion"); + } + + public boolean isDeletion() { + return varType.contains("deletion"); + } /** * get the base representation of this Variant @@ -162,11 +177,25 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V * @return a string, of ploidy */ @Override - public String getAlternateBases() { + public String getAlternateBase() { return getAllelesFWDString(); } - public boolean isIndel() { return isInsertion() || isDeletion() || varType.contains("in-del"); } + /** + * gets the alternate bases. Use this method if teh allele count is greater then 2 + * + * @return + */ + @Override + public List getAlternateBases() { + List list = new ArrayList(); + list.add(this.getAlternateBase()); + return list; + } + + public boolean isIndel() { + return isInsertion() || isDeletion() || varType.contains("in-del"); + } /** * gets the alternate base is the case of a SNP. Throws an IllegalStateException in the case @@ -178,9 +207,9 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V public char getAlternativeBaseForSNP() { return getAltSnpFWD(); /* if (!this.isSNP()) throw new IllegalStateException("we're not a SNP"); - if (getAlternateBases().charAt(0) == this.getReference()) - return getAlternateBases().charAt(1); - return getAlternateBases().charAt(0); */ + if (getAlternateBase().charAt(0) == this.getReference()) + return getAlternateBase().charAt(1); + return getAlternateBase().charAt(0); */ } /** @@ -193,10 +222,17 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V return 0; //To change body of implemented methods use File | Settings | File Templates. } - public boolean isReference() { return false; } // snp locations are never "reference", there's always a variant + public boolean isReference() { + return false; + } // snp locations are never "reference", there's always a variant - public boolean isHapmap() { return validationStatus.contains("by-hapmap"); } - public boolean is2Hit2Allele() { return validationStatus.contains("by-2hit-2allele"); } + public boolean isHapmap() { + return validationStatus.contains("by-hapmap"); + } + + public boolean is2Hit2Allele() { + return validationStatus.contains("by-2hit-2allele"); + } // ---------------------------------------------------------------------- // @@ -205,9 +241,9 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V // ---------------------------------------------------------------------- public String toString() { return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", - getLocation().getContig(), getLocation().getStart(), getLocation().getStop()+1, - name, strand, refBases, observed, molType, - varType, validationStatus, avHet, avHetSE, func, locType, weight ); + getLocation().getContig(), getLocation().getStart(), getLocation().getStop() + 1, + name, strand, refBases, observed, molType, + varType, validationStatus, avHet, avHetSE, func, locType, weight); } public String toSimpleString() { @@ -216,18 +252,18 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V public String toMediumString() { String s = String.format("%s:%s:%s", getLocation().toString(), name, getAllelesFWDString()); - if ( isSNP() ) s += ":SNP"; - if ( isIndel() ) s += ":Indel"; - if ( isHapmap() ) s += ":Hapmap"; - if ( is2Hit2Allele() ) s += ":2Hit"; + if (isSNP()) s += ":SNP"; + if (isIndel()) s += ":Indel"; + if (isHapmap()) s += ":Hapmap"; + if (is2Hit2Allele()) s += ":2Hit"; return s; } public String repl() { return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", - 585, getLocation().getContig(), getLocation().getStart()-1, getLocation().getStop(), - name, strand, refBases, refBases, observed, molType, - varType, validationStatus, avHet, avHetSE, func, locType, weight ); + 585, getLocation().getContig(), getLocation().getStart() - 1, getLocation().getStop(), + name, strand, refBases, refBases, observed, molType, + varType, validationStatus, avHet, avHetSE, func, locType, weight); } public boolean parseLine(final Object header, final String[] parts) { @@ -235,12 +271,12 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V String contig = parts[1]; long start = Long.parseLong(parts[2]) + 1; // The final is 0 based long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based - loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop-1)); + loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop - 1)); name = parts[4]; strand = parts[6]; refBases = parts[7]; - if ( strand == "-" ) + if (strand == "-") refBases = BaseUtils.simpleReverseComplement(refBases); observed = parts[9]; molType = parts[10]; @@ -253,69 +289,83 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V weight = Integer.parseInt(parts[17]); //System.out.printf("Parsed %s%n", toString()); return true; - } catch( MalformedGenomeLocException ex ) { + } catch (MalformedGenomeLocException ex) { // Just rethrow malformed genome locs; the ROD system itself will deal with these. throw ex; - } catch( ArrayIndexOutOfBoundsException ex ) { + } catch (ArrayIndexOutOfBoundsException ex) { // Just rethrow malformed genome locs; the ROD system itself will deal with these. throw new RuntimeException("Badly formed dbSNP line: " + ex); - } catch ( RuntimeException e ) { + } catch (RuntimeException e) { System.out.printf(" Exception caught during parsing DBSNP line %s%n", Utils.join(" <=> ", parts)); throw e; } } - public String getAltBasesFWD() { + public String getAltBasesFWD() { List alleles = getAllelesFWD(); return (alleles.get(0).equals(refBases) ? alleles.get(1) : alleles.get(0)); - } + } - public char getAltSnpFWD() throws IllegalStateException { - if ( !isSNP() ) + public char getAltSnpFWD() throws IllegalStateException { + if (!isSNP()) throw new IllegalStateException("I'm not a SNP"); return getAltBasesFWD().charAt(0); - } + } - public double getConsensusConfidence() { - // TODO Auto-generated method stub - return Double.MAX_VALUE; - } + public double getConsensusConfidence() { + // TODO Auto-generated method stub + return Double.MAX_VALUE; + } - public List getGenotype() throws IllegalStateException { - return Arrays.asList(Utils.join("", getAllelesFWD())); - } + public List getGenotype() throws IllegalStateException { + return Arrays.asList(Utils.join("", getAllelesFWD())); + } - public double getMAF() { - // Fixme: update to actually get MAF - //return avHet; + public double getMAF() { + // Fixme: update to actually get MAF + //return avHet; return -1; - } + } public double getHeterozygosity() { return avHet; } - public int getPloidy() throws IllegalStateException { - // TODO Auto-generated method stub - return 0; - } + public int getPloidy() throws IllegalStateException { + // TODO Auto-generated method stub + return 0; + } - public double getVariationConfidence() { - // TODO Auto-generated method stub - return Double.MAX_VALUE; - } + public double getVariationConfidence() { + // TODO Auto-generated method stub + return Double.MAX_VALUE; + } - public boolean isGenotype() { - // TODO Auto-generated method stub - return false; - } + public boolean isGenotype() { + // TODO Auto-generated method stub + return false; + } - public boolean isBiallelic() { - // TODO Auto-generated method stub - return observed.indexOf('/')==observed.lastIndexOf('/'); - } + public boolean isBiallelic() { + // TODO Auto-generated method stub + return observed.indexOf('/') == observed.lastIndexOf('/'); + } - public int length() { return (int)(loc.getStop() - loc.getStart() + 1); } + public int length() { + return (int) (loc.getStop() - loc.getStart() + 1); + } + + /** + * get the likelihoods + * + * @return an array in lexigraphical order of the likelihoods + */ + @Override + public List getGenotypes() { + List list = new ArrayList(); + list.add(new BasicGenotype(this.getLocation(), this.getAltBasesFWD(), this.getRefSnpFWD(), this.getConsensusConfidence())); + return list; + } /** * get the likelihoods @@ -325,7 +375,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V @Override public org.broadinstitute.sting.utils.genotype.Genotype getGenotype(DiploidGenotype x) { if (!x.toString().equals(this.getAltBasesFWD())) throw new IllegalStateException("Unable to retrieve genotype"); - return new BasicGenotype(this.getLocation(),this.getAltBasesFWD(),this.getRefSnpFWD(),this.getConsensusConfidence()); + return new BasicGenotype(this.getLocation(), this.getAltBasesFWD(), this.getRefSnpFWD(), this.getConsensusConfidence()); } /** @@ -338,6 +388,6 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, V */ @Override public boolean hasGenotype(DiploidGenotype x) { - return (!x.toString().equals(this.getAltBasesFWD())) ? false : true; + return (!x.toString().equals(this.getAltBasesFWD())) ? false : true; } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index 8f93b80e5..142fdc94f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -318,7 +318,7 @@ public class VariantFiltrationWalker extends LocusWalker { if ( VariantsToVCF.generateVCFRecord(context.getTracker(), context.getReferenceContext(), context.getAlignmentContext(true), vcfHeader, gt, map, sampleNames, out, false, false) ) { if ( !filterFailureString.equals("") ) map.put(VCFHeader.HEADER_FIELDS.FILTER, filterFailureString); - vcfWriter.addRecord(new VCFRecord(vcfHeader, map, "GT:GQ:DP", gt)); + vcfWriter.addRecord(new VCFRecord(map, "GT:GQ:DP", gt)); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/GenotypeConcordance.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/GenotypeConcordance.java index 0dfaae88e..6bfed0d05 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/GenotypeConcordance.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/GenotypeConcordance.java @@ -58,18 +58,18 @@ public class GenotypeConcordance extends BasicVariantAnalysis implements Genotyp int truthIndex, callIndex; if (chip == null) truthIndex = UNKNOWN; - else if (chip.getAlternateBases().equals(g.toString())) + else if (chip.getAlternateBase().equals(g.toString())) truthIndex = REF; - else if (chip.getAlternateBases().charAt(0) != chip.getAlternateBases().charAt(1)) + else if (chip.getAlternateBase().charAt(0) != chip.getAlternateBase().charAt(1)) truthIndex = VAR_HET; else truthIndex = VAR_HOM; if (eval == null) callIndex = NO_CALL; - else if (eval.getAlternateBases().equals(g.toString())) + else if (eval.getAlternateBase().equals(g.toString())) callIndex = REF; - else if (eval.getAlternateBases().charAt(0) != eval.getAlternateBases().charAt(1)) + else if (eval.getAlternateBase().charAt(0) != eval.getAlternateBase().charAt(1)) callIndex = VAR_HET; else callIndex = VAR_HOM; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/IndelMetricsAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/IndelMetricsAnalysis.java index 6878d91fa..d8eb506da 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/IndelMetricsAnalysis.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/IndelMetricsAnalysis.java @@ -38,10 +38,10 @@ public class IndelMetricsAnalysis extends BasicVariantAnalysis implements Genoty else throw new RuntimeException("Variation is indel, but isn't insertion or deletion!"); - if ( eval.getAlternateBases().length() < 100 ) { - sizes[eval.isDeletion() ? 0 : 1][eval.getAlternateBases().length()]++; - if ( eval.getAlternateBases().length() > maxSize ) - maxSize = eval.getAlternateBases().length(); + if ( eval.getAlternateBase().length() < 100 ) { + sizes[eval.isDeletion() ? 0 : 1][eval.getAlternateBase().length()]++; + if ( eval.getAlternateBase().length() > maxSize ) + maxSize = eval.getAlternateBase().length(); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PooledGenotypeConcordance.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PooledGenotypeConcordance.java index b0237a8ec..e5def9348 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PooledGenotypeConcordance.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PooledGenotypeConcordance.java @@ -5,8 +5,6 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotype.Variation; -import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import java.io.*; import java.util.*; @@ -185,7 +183,7 @@ class PooledConcordanceTable { public boolean pooledCallIsRef(Variation eval, char ref) { // code broken out for easy alteration when we start using pool-specific variations - return eval.getAlternateBases().equalsIgnoreCase((Utils.dupString(ref,2))); + return eval.getAlternateBase().equalsIgnoreCase((Utils.dupString(ref,2))); } public int calculateNumFrequencyIndeces(int poolSize) { @@ -209,7 +207,7 @@ class PooledConcordanceTable { for ( Variation eval : evals ) { if ( mismatchingCalls(firstEval, eval, ref) ) { // todo -- make this not a StingException but go to the log - throw new StingException("Tri-Allelic Position "+eval.getAlternateBases()+"/"+firstEval.getAlternateBases() + " Ref: "+ ref + " not supported"); + throw new StingException("Tri-Allelic Position "+eval.getAlternateBase()+"/"+firstEval.getAlternateBase() + " Ref: "+ ref + " not supported"); } else { alternateFrequency += calledVariantFrequency(eval,ref); } @@ -233,10 +231,10 @@ class PooledConcordanceTable { public boolean mismatchingCalls(Variation eval, Variation chip, char ref) { // eval and chip guaranteed to be non-null - char chipF = chip.getAlternateBases().charAt(0); - char chipS = chip.getAlternateBases().charAt(1); - char evalF = chip.getAlternateBases().charAt(0); - char evalS = chip.getAlternateBases().charAt(1); + char chipF = chip.getAlternateBase().charAt(0); + char chipS = chip.getAlternateBase().charAt(1); + char evalF = chip.getAlternateBase().charAt(0); + char evalS = chip.getAlternateBase().charAt(1); boolean mismatch; if (chipF == ref) { if ( chipS == ref ) { @@ -261,7 +259,7 @@ class PooledConcordanceTable { public double calledVariantFrequency( Variation var, char ref ) { // code broken out for easy alteration when we start using pool-specific variations - String varStr = var.getAlternateBases(); + String varStr = var.getAlternateBase(); double freq; if ( varStr.charAt(0) != ref && varStr.charAt(1) != ref ) { freq = (double) 2; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java index ae1fd211c..ca182da84 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantCounter.java @@ -31,7 +31,7 @@ public class VariantCounter extends BasicVariantAnalysis implements GenotypeAnal public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) { nSNPs += eval == null ? 0 : 1; - if ( this.getMaster().evalContainsGenotypes && eval != null && eval.isSNP() && ((VariantBackedByGenotype)eval).getGenotype( DiploidGenotype.valueOf(eval.getAlternateBases())).isHet() ) + if ( this.getMaster().evalContainsGenotypes && eval != null && eval.isSNP() && ((VariantBackedByGenotype)eval).getGenotype( DiploidGenotype.valueOf(eval.getAlternateBase())).isHet() ) nHets++; return null; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java index 36ce49d51..26f89e760 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java @@ -86,7 +86,7 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA if (eval != null) { char alt = (eval.isSNP()) ? eval.getAlternativeBaseForSNP() : eval.getReference().charAt(0); if (dbSNP != null && dbSNP.isSNP()) - return !dbSNP.getAlternateBases().contains(String.valueOf(alt)); + return !dbSNP.getAlternateBase().contains(String.valueOf(alt)); } return false; } @@ -117,7 +117,7 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA if (dbsnp.isSNP() && eval.isSNP() && discordantP(dbsnp, eval)) { return String.format("Discordant [DBSNP %s] [EVAL %s]", dbsnp, eval); } else if (dbsnp.isIndel() && eval.isSNP()) { - return String.format("SNP-at-indel DBSNP=%s %s", dbsnp.getAlternateBases(), eval); + return String.format("SNP-at-indel DBSNP=%s %s", dbsnp.getAlternateBase(), eval); } else { return null; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java index 2e5a5407b..4e8b374f9 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java @@ -77,7 +77,7 @@ public class VariantsToVCF extends RefWalker { List gt = new ArrayList(); Map map = new HashMap(); if ( generateVCFRecord(tracker, ref, context, vcfheader, gt, map, sampleNames, out, SUPPRESS_MULTISTATE, VERBOSE) ) { - vcfwriter.addRecord(new VCFRecord(vcfheader, map, "GT:GQ:DP", gt)); + vcfwriter.addRecord(new VCFRecord(map, "GT:GQ:DP", gt)); //vcfwriter.addRecord(new VCFRecord(vcfheader, map, "GT", gt)); return 1; } @@ -123,7 +123,7 @@ public class VariantsToVCF extends RefWalker { int allele2 = BaseUtils.simpleBaseToBaseIndex(alleles.get(0).charAt(1)); if (allele2 >= 0 && allele2 != refbase) { alleleCounts[allele2]++; } - gt.add(new VCFGenotypeRecord(name, str, alleles, VCFGenotypeRecord.PHASE.UNPHASED, ref.getBase())); + gt.add(new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED, str )); numSNPs++; snpQual += av.getVariationConfidence(); @@ -134,7 +134,7 @@ public class VariantsToVCF extends RefWalker { List alleles = new ArrayList(); alleles.add(ref.getBase() + "" + ref.getBase()); - gt.add(new VCFGenotypeRecord(name, str, alleles, VCFGenotypeRecord.PHASE.UNPHASED, ref.getBase())); + gt.add(new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED, str )); numRefs++; } @@ -184,7 +184,7 @@ public class VariantsToVCF extends RefWalker { } else if (field == VCFHeader.HEADER_FIELDS.QUAL) { map.put(field, String.format("%d", snpQual > 99 ? 99 : (int) snpQual)); } else if (field == VCFHeader.HEADER_FIELDS.FILTER) { - map.put(field, String.valueOf("0")); + map.put(field, "0"); } else if (field == VCFHeader.HEADER_FIELDS.INFO) { String infostr = "."; ArrayList info = new ArrayList(); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/BasicVariation.java b/java/src/org/broadinstitute/sting/utils/genotype/BasicVariation.java index 0f3f77967..c0e60c187 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/BasicVariation.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/BasicVariation.java @@ -2,6 +2,9 @@ package org.broadinstitute.sting.utils.genotype; import org.broadinstitute.sting.utils.GenomeLoc; +import java.util.List; +import java.util.ArrayList; + /** * User: aaron * Date: Sep 9, 2009 @@ -59,8 +62,7 @@ public class BasicVariation implements Variation { */ @Override public VARIANT_TYPE getType() { - if (mLength > 0) return VARIANT_TYPE.INDEL; - if (mLength < 0) return VARIANT_TYPE.DELETION; + if (mLength != 0) return VARIANT_TYPE.INDEL; return (isSNP()) ? VARIANT_TYPE.SNP : VARIANT_TYPE.REFERENCE; } @@ -81,10 +83,22 @@ public class BasicVariation implements Variation { } @Override - public String getAlternateBases() { + public String getAlternateBase() { return mBases; } + /** + * gets the alternate bases. Use this method if teh allele count is greater then 2 + * + * @return + */ + @Override + public List getAlternateBases() { + List list = new ArrayList(); + list.add(this.getAlternateBase()); + return list; + } + @Override public GenomeLoc getLocation() { return mLocation; @@ -95,6 +109,12 @@ public class BasicVariation implements Variation { return (mRef); } + /** are we bi-allelic? */ + @Override + public boolean isBiallelic() { + return true; + } + @Override public double getNegLog10PError() { return mConfidence; @@ -131,9 +151,9 @@ public class BasicVariation implements Variation { if (!this.isSNP()) throw new IllegalStateException("we're not a SNP"); // we know that if we're a snp, the reference is a single base, so charAt(0) is safe - if (getAlternateBases().charAt(0) == this.getReference().charAt(0)) - return getAlternateBases().charAt(1); - return getAlternateBases().charAt(0); + if (getAlternateBase().charAt(0) == this.getReference().charAt(0)) + return getAlternateBase().charAt(1); + return getAlternateBase().charAt(0); } /** @@ -146,9 +166,9 @@ public class BasicVariation implements Variation { if (!this.isSNP()) throw new IllegalStateException("we're not a SNP"); // we know that if we're a snp, the reference is a single base, so charAt(0) is safe - if (getAlternateBases().charAt(0) == this.getReference().charAt(0)) - return getAlternateBases().charAt(0); - return getAlternateBases().charAt(1); + if (getAlternateBase().charAt(0) == this.getReference().charAt(0)) + return getAlternateBase().charAt(0); + return getAlternateBase().charAt(1); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/VariantBackedByGenotype.java b/java/src/org/broadinstitute/sting/utils/genotype/VariantBackedByGenotype.java index dc3a41925..f6833696c 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/VariantBackedByGenotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/VariantBackedByGenotype.java @@ -1,6 +1,6 @@ package org.broadinstitute.sting.utils.genotype; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; +import java.util.List; /** * @author aaron @@ -13,13 +13,16 @@ public interface VariantBackedByGenotype { /** * get the likelihoods * - * @param x the genotype + * @return a map in lexigraphical order of the likelihoods + */ + public List getGenotypes(); + + /** + * get the likelihoods * - * @return an array in lexigraphical order of the likelihoods + * @return a map in lexigraphical order of the likelihoods */ public Genotype getGenotype(DiploidGenotype x); - - /** * do we have the specified genotype? not all backedByGenotypes * have all the genotype data. diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Variation.java b/java/src/org/broadinstitute/sting/utils/genotype/Variation.java index fdc8feaec..16dcde906 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/Variation.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Variation.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.utils.genotype; import org.broadinstitute.sting.utils.GenomeLoc; +import java.util.List; + /** * @author aaron *

@@ -12,7 +14,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; public interface Variation { // the types of variants we currently allow public enum VARIANT_TYPE { - SNP, INDEL, DELETION, REFERENCE // though reference is not really a variant + SNP, INDEL, REFERENCE // though reference is not really a variant } /** @@ -60,6 +62,9 @@ public interface Variation { */ public String getReference(); + /** are we bi-allelic? */ + public boolean isBiallelic(); + /** * get the -1 * (log 10 of the error value) * @@ -69,31 +74,43 @@ public interface Variation { /** * are we truely a variant, given a reference + * * @return false if we're a variant(indel, delete, SNP, etc), true if we're not */ public boolean isReference(); /** - * gets the alternate bases. If this is homref, throws an UnsupportedOperationException + * gets the alternate base. Use this method if we're biallelic + * * @return */ - public String getAlternateBases(); + public String getAlternateBase(); + + /** + * gets the alternate bases. Use this method if teh allele count is greater then 2 + * + * @return + */ + public List getAlternateBases(); /** * are we an insertion or a deletion? yes, then return true. No? Well, false then. + * * @return true if we're an insertion or deletion */ public boolean isIndel(); /** * gets the alternate base is the case of a SNP. Throws an IllegalStateException if we're not a SNP - * of + * of + * * @return a char, representing the alternate base */ public char getAlternativeBaseForSNP(); /** * gets the reference base is the case of a SNP. Throws an IllegalStateException if we're not a SNP + * * @return a char, representing the alternate base */ public char getReferenceForSNP(); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java index 15aa1b289..b3c032c42 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java @@ -21,35 +21,33 @@ public class VCFGenotypeRecord { } // our phasing - private PHASE phaseType; - - // our reference bases(s) - private final char mReferenceBase; + private PHASE mPhaseType; // our bases(s) - private final List mAlleleBases = new ArrayList(); + private final List mGenotypeAlleles = new ArrayList(); // our mapping of the format mFields to values private final Map mFields = new HashMap(); // our sample name private final String mSampleName; + /** - * create a VCF record + * Create a VCF genotype record * - * @param keyValues the key values - * @param Alleles the alleles, one if we're halpoid, two if we're diploid - * @param phasing the phasing of the the genotype - * @param referenceBase the reference base + * @param sampleName + * @param genotypes + * @param phasing + * @param otherFlags */ - public VCFGenotypeRecord(String sampleName, Map keyValues, List Alleles, PHASE phasing, char referenceBase) { - mSampleName = sampleName; - mReferenceBase = referenceBase; - mFields.putAll(keyValues); - mAlleleBases.addAll(Alleles); - phaseType = phasing; + public VCFGenotypeRecord(String sampleName, List genotypes, PHASE phasing, Map otherFlags) { + this.mSampleName = sampleName; + this.mGenotypeAlleles.addAll(genotypes); + this.mPhaseType = phasing; + this.mFields.putAll(otherFlags); } + /** * determine the phase of the genotype * @@ -70,18 +68,55 @@ public class VCFGenotypeRecord { /** getter methods */ public PHASE getPhaseType() { - return phaseType; + return mPhaseType; } - public char getReference() { - return mReferenceBase; + public String getSampleName() { + return mSampleName; } - public List getAllele() { - return mAlleleBases; + public List getAlleles() { + return mGenotypeAlleles; } public Map getFields() { return mFields; } + + public String toGenotypeString(List altAlleles) { + String str = ""; + boolean first = true; + for (String allele : mGenotypeAlleles) { + str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0); + if (first) { + switch (mPhaseType) { + case UNPHASED: + str += "/"; + break; + case PHASED: + str += "|"; + break; + case PHASED_SWITCH_PROB: + str += "\\"; + break; + case UNKNOWN: + throw new UnsupportedOperationException("Unknown phase type"); + } + first = false; + } + } + return str; + } + + + public boolean equals(Object other) { + if (other instanceof VCFGenotypeRecord) { + if (((VCFGenotypeRecord) other).mPhaseType != this.mPhaseType) return false; + if (!((VCFGenotypeRecord) other).mGenotypeAlleles.equals(this.mGenotypeAlleles)) return false; + if (!((VCFGenotypeRecord) other).mFields.equals(mFields)) return false; + if (!((VCFGenotypeRecord) other).mSampleName.equals(this.mSampleName)) return false; + return true; + } + return false; + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java index a5a0882f1..08b23d269 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java @@ -195,9 +195,9 @@ public class VCFReader implements Iterator, Iterable { genotypeRecords.add(getVCFGenotype(str, mFormatString, tokens[index], values.get(VCFHeader.HEADER_FIELDS.ALT).split(","), values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0))); index++; } - return new VCFRecord(mHeader, values, mFormatString, genotypeRecords); + return new VCFRecord(values, mFormatString, genotypeRecords); } - return new VCFRecord(mHeader, values); + return new VCFRecord(values); } /** @@ -216,7 +216,6 @@ public class VCFReader implements Iterator, Iterable { Map tagToValue = new HashMap(); VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN; List bases = new ArrayList(); - int addedCount = 0; String keyStrings[] = formatString.split(":"); for (String key : keyStrings) { String parse; @@ -236,19 +235,20 @@ public class VCFReader implements Iterator, Iterable { addAllele(m.group(1), altAlleles, referenceBase, bases); if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases); } - tagToValue.put(key, parse); - addedCount++; + else { + tagToValue.put(key, parse); + } if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1; genotypeString = genotypeString.substring(nextDivider + 1, genotypeString.length()); } // catch some common errors, either there are too many field keys or there are two many field values - if (keyStrings.length != tagToValue.size()) + if (keyStrings.length != tagToValue.size() + ((bases.size() > 0) ? 1 : 0)) throw new RuntimeException("VCFReader: genotype value count doesn't match the key count (expected " + keyStrings.length + " but saw " + tagToValue.size() + ")"); else if (genotypeString.length() > 0) throw new RuntimeException("VCFReader: genotype string contained additional unprocessed fields: " + genotypeString + ". This most likely means that the format string is shorter then the value fields."); - return new VCFGenotypeRecord(sampleName, tagToValue, bases, phase, referenceBase); + return new VCFGenotypeRecord(sampleName, bases, phase, tagToValue); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index 65deb37fe..e7799349d 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.utils.genotype.vcf; +import org.broadinstitute.sting.utils.Utils; + import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -8,90 +10,165 @@ import java.util.Map; /** the basic VCF record type */ public class VCFRecord { - // required field values - private final Map mValues = new HashMap(); + public static final String FIELD_SEPERATOR = "\t"; + // the reference base + private char mReferenceBase; + // our contig + private String mChrome; + // our position + private int mPosition; + // our id; set to '.' if not available + private String mID; + // the alternate bases + private final List mAlts = new ArrayList(); + // our qual value + private int mQual; + // our filter string + private String mFilterString; + // our info fields + private final Map mInfoFields = new HashMap(); + + private final String mGenotypeFormatString; // our genotype sample fields - private final List mGenotypeFields; - - // the format String, which specifies what each genotype can contain for values - private final String mFormatString; - - // the associated header - private final VCFHeader mHeader; + private final List mGenotypeFields = new ArrayList(); /** * given a VCF header, and the values for each of the columns, create a VCF record. * - * @param header the VCF header - * @param columnValues a mapping of header strings to values + * @param columnValues a mapping of header strings to values * @param formatString the format string for the genotype records * @param genotypeRecords the genotype records */ - public VCFRecord(VCFHeader header, Map columnValues, String formatString, List genotypeRecords) { - mHeader = header; - mValues.putAll(columnValues); - mFormatString = formatString; - mGenotypeFields = new ArrayList(); + public VCFRecord(Map columnValues, String formatString, List genotypeRecords) { + extractFields(columnValues); mGenotypeFields.addAll(genotypeRecords); + mGenotypeFormatString = formatString; + } + + /** + * create a VCF record + * + * @param referenceBase the reference base to use + * @param contig the contig this variant is on + * @param position our position + * @param ID our ID string + * @param altBases the list of alternate bases + * @param qual the qual field + * @param filters the filters used on this variant + * @param infoFields the information fields + * @param genotypeObjects the genotype objects + */ + public VCFRecord(char referenceBase, + String contig, + int position, + String ID, + List altBases, + int qual, + String filters, + Map infoFields, + String genotypeFormatString, + List genotypeObjects) { + this.setReferenceBase(referenceBase); + this.mChrome = contig; + this.setPosition(position); + this.mID = ID; + for (String alt : altBases) + this.addAlternateBase(alt); + this.setQual(qual); + this.setFilterString(filters); + this.mInfoFields.putAll(infoFields); + mGenotypeFormatString = genotypeFormatString; + this.mGenotypeFields.addAll(genotypeObjects); } /** * given a VCF header, and the values for each of the columns, create a VCF record. * - * @param header the VCF header - * @param columnValues a mapping of header strings to values + * @param columnValues a mapping of header strings to values */ - public VCFRecord(VCFHeader header, Map columnValues) { - mHeader = header; - mValues.putAll(columnValues); - mGenotypeFields = null; - mFormatString = null; + public VCFRecord(Map columnValues) { + extractFields(columnValues); + mGenotypeFormatString = null; + } + + /** + * extract the field values from the passed in array + * + * @param columnValues a map of the header fields to values + */ + private void extractFields(Map columnValues) { + for (VCFHeader.HEADER_FIELDS val : columnValues.keySet()) { + switch (val) { + case CHROM: + this.setChomosome(columnValues.get(val)); + break; + case POS: + this.setPosition(Integer.valueOf(columnValues.get(val))); + break; + case ID: + this.setID(columnValues.get(val)); + break; + case REF: + if (columnValues.get(val).length() != 1) + throw new IllegalArgumentException("Reference base should be a single character"); + this.setReferenceBase(columnValues.get(val).charAt(0)); + break; + case ALT: + String values[] = columnValues.get(val).split(","); + for (String alt : values) + addAlternateBase(alt); + break; + case QUAL: + this.setQual(Integer.valueOf(columnValues.get(val))); + break; + case FILTER: + this.setFilterString(columnValues.get(val)); + break; + case INFO: + String vals[] = columnValues.get(val).split(";"); + for (String alt : vals) { + String keyVal[] = alt.split("="); + if (keyVal.length == 1 && keyVal[0].equals(".")) { + this.addInfoField(keyVal[0], ""); + break; + } + if (keyVal.length != 2) + throw new IllegalArgumentException("info field key-value pair did not parse into key->value pair: " + alt); + this.addInfoField(keyVal[0], keyVal[1]); + } + break; + } + } } /** * do we have genotyping data + * * @return true if we have genotyping data, false otherwise */ + public boolean hasGenotypeData() { - if (mGenotypeFields==null) { + if (mGenotypeFields.size() < 1) { return false; } return true; } - /** - * get the format string - * @return the format sting, null if it doesn't exist - */ - public String getFormatString() { - return mFormatString; - } - - /** - * get a required field, given the field tag - * - * @param field the key for the field - * - * @return the field value - */ - public String getValue(VCFHeader.HEADER_FIELDS field) { - return mValues.get(field); - } - /** @return the string for the chromosome that this VCF record is associated with */ public String getChromosome() { - return mValues.get(VCFHeader.HEADER_FIELDS.CHROM); + return this.mChrome; } + /** @return this VCF records position on the specified chromosome */ public long getPosition() { - return Long.valueOf(this.mValues.get(VCFHeader.HEADER_FIELDS.POS)); + return this.mPosition; } /** @return the ID value for this record */ public String getID() { - return mValues.get(VCFHeader.HEADER_FIELDS.ID); + return this.mID; } /** @@ -100,8 +177,7 @@ public class VCFRecord { * @return either A, T, C, G, or N */ public char getReferenceBase() { - // TODO: this field isn't validated correctly - return mValues.get(VCFHeader.HEADER_FIELDS.REF).charAt(0); + return this.mReferenceBase; } /** @@ -109,20 +185,17 @@ public class VCFRecord { * * @return an array of strings representing the alt alleles, or null if there are none */ - public String[] getAlternateAlleles() { - if (mValues.get(VCFHeader.HEADER_FIELDS.ALT).trim().equals(".")) { - return null; - } - return mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(","); + public List getAlternateAlleles() { + return this.mAlts; } public boolean hasAlternateAllele() { - return getAlternateAlleles() != null; + return getAlternateAlleles().size() > 0; } /** @return the phred-scaled quality score */ public int getQual() { - return Integer.valueOf(mValues.get(VCFHeader.HEADER_FIELDS.QUAL)); + return this.mQual; } /** @@ -131,10 +204,10 @@ public class VCFRecord { * @return an array of strings representing the filtering criteria, or null if none were applied */ public String[] getFilteringCodes() { - if (mValues.get(VCFHeader.HEADER_FIELDS.FILTER).trim().equals("0")) { - return null; - } - return mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(";"); + //if (this.mFilterString.equals("0")) { + // return null; + //} + return this.mFilterString.split(";"); } public boolean hasFilteringCodes() { @@ -147,25 +220,18 @@ public class VCFRecord { * @return a map, of the info key-value pairs */ public Map getInfoValues() { - Map ret = new HashMap(); - String infoSplit[] = mValues.get(VCFHeader.HEADER_FIELDS.INFO).split(";"); - for (String s : infoSplit) { - String keyValue[] = s.split("="); - if (keyValue.length != 2) - throw new RuntimeException("Key value pairs must have both a key and a value; pair: " + s); - ret.put(keyValue[0], keyValue[1]); - } - return ret; + return this.mInfoFields; } /** @return the number of columnsof data we're storing */ public int getColumnCount() { - if (this.hasGenotypeData()) return mGenotypeFields.size() + mValues.size(); - return mValues.size(); + if (this.hasGenotypeData()) return mGenotypeFields.size() + VCFHeader.HEADER_FIELDS.values().length; + return VCFHeader.HEADER_FIELDS.values().length; } /** * return the mapping of the format tags to the specified sample's values + * * @return a VCFGenotypeRecord */ public List getVCFGenotypeRecords() { @@ -174,9 +240,130 @@ public class VCFRecord { /** @return a List of the sample names */ public String[] getSampleNames() { - String ret[] = new String[mHeader.getGenotypeSamples().size()]; - mHeader.getGenotypeSamples().toArray(ret); - return ret; + String names[] = new String[mGenotypeFields.size()]; + int index = 0; + for (VCFGenotypeRecord rec : this.mGenotypeFields) + names[index++] = rec.getSampleName(); + return names; + } + + public String getGenotypeFormatString() { + return mGenotypeFormatString; + }// the formatting string for our genotype records + + public void setReferenceBase(char referenceBase) { + // upppercase the character + referenceBase = (char) ((referenceBase > 96) ? referenceBase - 32 : referenceBase); + if (referenceBase != 'A' && referenceBase != 'C' && referenceBase != 'T' && referenceBase != 'G' && referenceBase != 'N') + throw new IllegalArgumentException("Reference base must be one of A,C,G,T,N, we saw: " + referenceBase); + this.mReferenceBase = referenceBase; + } + + public void setPosition(int mPosition) { + if (mPosition < 0) + throw new IllegalArgumentException("Position values must be greater than 0"); + this.mPosition = mPosition; + } + + public void setChomosome(String mChrome) { + this.mChrome = mChrome; + } + + public void setID(String mID) { + this.mID = mID; + } + + public void setQual(int mQual) { + if (mQual < 0) + throw new IllegalArgumentException("Qual values must be greater than 0"); + this.mQual = mQual; + } + + public void setFilterString(String mFilterString) { + this.mFilterString = mFilterString; + } + + public void addGenotypeFields(VCFGenotypeRecord mGenotypeFields) { + this.mGenotypeFields.add(mGenotypeFields); + } + + public void addAlternateBase(String base) { + if (base.length() == 1) { + char nuc = (char) ((base.charAt(0) > 96) ? base.charAt(0) - 32 : base.charAt(0)); + if (nuc != 'A' && nuc != 'C' && nuc != 'T' && nuc != 'G' && nuc != '.') + throw new IllegalArgumentException("Alternate base must be either A,C,T,G,. or if an indel it must contain length information: " + base); + } else { + // we must be an indel, check that the first character is I or D + char nuc = (char) ((base.charAt(0) > 96) ? base.charAt(0) - 32 : base.charAt(0)); + if (nuc != 'I' && nuc != 'D') + throw new IllegalArgumentException("Alternate bases of length greater then one must be an indel: " + base); + } + this.mAlts.add(base); + } + + public void addInfoField(String key, String value) { + this.mInfoFields.put(key, value); + } + + public String toString() { + StringBuilder builder = new StringBuilder(); + + // else builder.append(FIELD_SEPERATOR + record.getValue(field)); + // CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO + builder.append(getChromosome() + FIELD_SEPERATOR); + builder.append(getPosition() + FIELD_SEPERATOR); + builder.append(getID() + FIELD_SEPERATOR); + builder.append(getReferenceBase() + FIELD_SEPERATOR); + String alts = ""; + for (String str : this.getAlternateAlleles()) alts += str + ","; + builder.append(alts.substring(0, alts.length() - 1) + FIELD_SEPERATOR); + builder.append(getQual() + FIELD_SEPERATOR); + builder.append(Utils.join(";", getFilteringCodes()) + FIELD_SEPERATOR); + String info = ""; + for (String str : this.getInfoValues().keySet()) { + if (str.equals(".")) + info = "."; + else + info += str + "=" + getInfoValues().get(str) + ";"; + } + + if (info.length() > 1) builder.append(info.substring(0, info.length() - 1)); + else builder.append(info); + + if (this.hasGenotypeData()) { + builder.append(FIELD_SEPERATOR + this.getGenotypeFormatString()); + for (VCFGenotypeRecord rec : this.getVCFGenotypeRecords()) { + builder.append(FIELD_SEPERATOR); + if (!rec.toGenotypeString(this.mAlts).equals("")) + builder.append(rec.toGenotypeString(this.mAlts)); + for (String s : rec.getFields().keySet()) { + if (rec.getFields().get(s).equals("")) continue; + builder.append(":"); + builder.append(rec.getFields().get(s)); + } + } + } + return builder.toString(); + } + + /** + * compare two VCF records + * + * @param other the other VCF record + * + * @return true if they're equal + */ + public boolean equals(VCFRecord other) { + if (!this.mAlts.equals(other.mAlts)) return false; + if (this.mReferenceBase != other.mReferenceBase) return false; + if (!this.mChrome.equals(other.mChrome)) return false; + if (this.mPosition != other.mPosition) return false; + if (!this.mID.equals(other.mID)) return false; + if (this.mQual != other.mQual) return false; + if (!this.mFilterString.equals(other.mFilterString)) return false; + if (!this.mInfoFields.equals(other.mInfoFields)) return false; + if (!this.mGenotypeFields.equals(other.mGenotypeFields)) return false; + return true; } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index 8451f5172..3da5c2757 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -61,33 +61,13 @@ public class VCFWriter { throw new RuntimeException("Record has " + record.getColumnCount() + " columns, when is should have " + mHeader.getColumnCount()); } - StringBuilder builder = new StringBuilder(); + String vcfString = record.toString(); + try { + mWriter.write(vcfString + "\n"); + } catch (IOException e) { + throw new RuntimeException("Unable to write the VCF object to a file"); + } - // first output the required fields in order - boolean first = true; - for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields()) { - if (first) { - first = false; - builder.append(record.getValue(field)); - } else builder.append(FIELD_SEPERATOR + record.getValue(field)); - } - if (record.hasGenotypeData()) { - builder.append(FIELD_SEPERATOR + record.getFormatString()); - for (VCFGenotypeRecord rec : record.getVCFGenotypeRecords()) { - builder.append(FIELD_SEPERATOR); - boolean ft = true; - for (String s : rec.getFields().keySet()) { - if (!ft) builder.append(":"); - else ft = true; - builder.append(rec.getFields().get(s)); - } - } - try { - mWriter.write(builder.toString() + "\n"); - } catch (IOException e) { - throw new RuntimeException("Unable to write the VCF object to a file"); - } - } } /** attempt to close the VCF file */ diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java new file mode 100755 index 000000000..feb6a1943 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/refdata/RodVCFTest.java @@ -0,0 +1,179 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.genotype.DiploidGenotype; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader; +import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; +import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; +import org.junit.Assert; +import static org.junit.Assert.fail; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + + +/** + * @author aaron + *

+ * Class RodVCFTest + *

+ * test out the rod VCF + */ +public class RodVCFTest extends BaseTest { + + private static IndexedFastaSequenceFile seq; + private static File vcfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf"); + + @BeforeClass + public static void beforeTests() { + try { + seq = new IndexedFastaSequenceFile(new File("/broad/1KG/reference/human_b36_both.fasta")); + } catch (FileNotFoundException e) { + throw new StingException("unable to load the sequence dictionary"); + } + GenomeLocParser.setupRefContigOrdering(seq); + + } + + private RodVCF getVCFObject() { + RodVCF vcf = new RodVCF("VCF"); + VCFHeader header = null; + try { + header = (VCFHeader) vcf.initialize(vcfFile); + } catch (FileNotFoundException e) { + fail("Unable to open VCF file"); + } + header.checkVCFVersion(); + return vcf; + } + + @Test + public void testInitialize() { + getVCFObject(); + } + + @Test + public void testToIterator() { + RodVCF vcf = getVCFObject(); + VCFReader reader = new VCFReader(vcfFile); + Iterator iter = vcf.createIterator("VCF", vcfFile); + while (iter.hasNext()) { + VCFRecord rec1 = reader.next(); + VCFRecord rec2 = iter.next().mCurrentRecord; + if (!rec1.equals(rec2)) { + fail("VCF record rec1 != rec2"); + } + } + } + + @Test + public void testToMAF() { + RodVCF vcf = getVCFObject(); + Iterator iter = vcf.createIterator("VCF", vcfFile); + RodVCF rec = iter.next(); + Assert.assertEquals(rec.getNonRefAlleleFrequency(), 0.786, 0.00001); + rec = iter.next(); + rec = iter.next(); + rec = iter.next(); + Assert.assertEquals(rec.getNonRefAlleleFrequency(), 0.0, 0.00001); + } + + @Test + public void testToString() { + // slightly altered line, due to map ordering + String firstLine = "20\t14370\trs6054257\tG\tA\t29\t0\tDP=258;AF=0.786;NS=58\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5\n"; + RodVCF vcf = getVCFObject(); + VCFReader reader = new VCFReader(vcfFile); + Iterator iter = vcf.createIterator("VCF", vcfFile); + boolean first = true; + while (iter.hasNext()) { + VCFRecord rec1 = reader.next(); + VCFRecord rec2 = iter.next().mCurrentRecord; + if (!rec1.toString().equals(rec2.toString())) { + fail("VCF record rec1.toString() != rec2.toString()"); + } + // verify the first line too + if (first) { + if (!firstLine.equals(rec1.toString() + "\n")) { + fail("VCF record rec1.toString() != expected string :\n" + rec1.toString() + firstLine); + } + first = false; + } + } + } + + @Test + public void testType() { + RodVCF vcf = getVCFObject(); + Iterator iter = vcf.createIterator("VCF", vcfFile); + RodVCF rec = iter.next(); + Assert.assertTrue(rec.isSNP()); + rec = iter.next(); + rec = iter.next(); + rec = iter.next(); + rec = iter.next(); + Assert.assertTrue(rec.isIndel()); + Assert.assertTrue(rec.isInsertion()); + Assert.assertTrue(rec.isDeletion()); + } + + @Test + public void testGetGenotypes() { + RodVCF vcf = getVCFObject(); + Iterator iter = vcf.createIterator("VCF", vcfFile); + RodVCF rec = iter.next(); + // we should get back a ref 'G' and an alt 'A' + List list = rec.getGenotypes(); + List knowngenotypes = new ArrayList(); + knowngenotypes.add("GG"); + knowngenotypes.add("AG"); + knowngenotypes.add("AA"); + Assert.assertEquals(3, list.size()); + for (Genotype g: list) { + Assert.assertTrue(knowngenotypes.contains(g.getBases())); + } + } + @Test + public void testGetGenotypesQual() { + RodVCF vcf = getVCFObject(); + Iterator iter = vcf.createIterator("VCF", vcfFile); + RodVCF rec = iter.next(); + // we should get back a ref 'G' and an alt 'A' + List list = rec.getGenotypes(); + Assert.assertEquals(4.8,list.get(0).getNegLog10PError(),0.0001); + Assert.assertEquals(4.8,list.get(1).getNegLog10PError(),0.0001); + Assert.assertEquals(4.3,list.get(2).getNegLog10PError(),0.0001); + } + @Test + public void testHasGenotypes() { + RodVCF vcf = getVCFObject(); + Iterator iter = vcf.createIterator("VCF", vcfFile); + RodVCF rec = iter.next(); + Assert.assertTrue(rec.hasGenotype(DiploidGenotype.valueOf("GG"))); + Assert.assertTrue(rec.hasGenotype(DiploidGenotype.valueOf("AG"))); + Assert.assertTrue(rec.hasGenotype(DiploidGenotype.valueOf("AA"))); + Assert.assertTrue(!rec.hasGenotype(DiploidGenotype.valueOf("TT"))); + } + + @Test + public void testGetLocation() { + RodVCF vcf = getVCFObject(); + Iterator iter = vcf.createIterator("VCF", vcfFile); + RodVCF rec = iter.next(); + GenomeLoc loc = rec.getLocation(); + Assert.assertEquals(14370,loc.getStart()); + Assert.assertEquals(14370,loc.getStop()); + Assert.assertTrue(loc.getContig().equals("20")); + } + +} diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 7eaf0404e..9ae04eaee 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -8,64 +8,63 @@ import java.util.Arrays; public class VariantFiltrationIntegrationTest extends WalkerTest { @Test public void testIntervals() { - - String[] md5DoC = {"5f176d27eba04c8e08b54b60fd96758e", "21c8e1f9dc65fdfb39347547f9b04011"}; + String[] md5DoC = {"eada262e03738876a2b5b94d56f0951e", "21c8e1f9dc65fdfb39347547f9b04011"}; WalkerTestSpec spec1 = new WalkerTestSpec( "-T VariantFiltration -X DepthOfCoverage:max=70 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", 2, Arrays.asList(md5DoC)); executeTest("testDoCFilter", spec1); - String[] md5AlleleBalance = {"dfa47f9465214445b737ec49025d6b5f", "a13e4ce6260bf9f33ca99dc808b8e6ad"}; + String[] md5AlleleBalance = {"0ade6c97a46245a65558d115ef1e3956", "a13e4ce6260bf9f33ca99dc808b8e6ad"}; WalkerTestSpec spec2 = new WalkerTestSpec( "-T VariantFiltration -X AlleleBalance:low=0.25,high=0.75 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", 2, Arrays.asList(md5AlleleBalance)); executeTest("testAlleleBalanceFilter", spec2); - String[] md5Strand = {"2d32ddd6daccd08f3df148f4d373fa5e", "0f7db0aad764268ee8fa3b857df8d87d"}; + String[] md5Strand = {"1a9568e3f7e88483e7fbc5f981a7973c", "0f7db0aad764268ee8fa3b857df8d87d"}; WalkerTestSpec spec3 = new WalkerTestSpec( "-T VariantFiltration -X FisherStrand:pvalue=0.0001 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", 2, Arrays.asList(md5Strand)); executeTest("testStrandFilter", spec3); - String[] md5Lod = {"157e7beab140f4252ed41ebfc9a508f2", "7e0c4f2b0fda85fd2891eee76c396a55"}; + String[] md5Lod = {"64bc742d0f44a7279ac6c4ad5c28da28", "7e0c4f2b0fda85fd2891eee76c396a55"}; WalkerTestSpec spec4 = new WalkerTestSpec( "-T VariantFiltration -X LodThreshold:lod=10 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", 2, Arrays.asList(md5Lod)); executeTest("testLodFilter", spec4); - String[] md5MQ0 = {"15738f3717dfa3192566a7ddf301fe11", "3203de335621851bccf596242b079e23"}; + String[] md5MQ0 = {"1201176efca694e2566a9de1508517b8", "3203de335621851bccf596242b079e23"}; WalkerTestSpec spec5 = new WalkerTestSpec( "-T VariantFiltration -X MappingQualityZero:max=70 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", 2, Arrays.asList(md5MQ0)); executeTest("testMappingQuality0Filter", spec5); - String[] md5MQ = {"e7fb1f166d2359f653f98f8215215aca", "ecc777feedea61f7b570d114c2ab89b1"}; + String[] md5MQ = {"ed922e58c856e1bc3f125b635dea48fc", "ecc777feedea61f7b570d114c2ab89b1"}; WalkerTestSpec spec6 = new WalkerTestSpec( "-T VariantFiltration -X MappingQuality:min=20 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", 2, Arrays.asList(md5MQ)); executeTest("testRMSMappingQualityFilter", spec6); - String[] md5OnOff = {"fe4acc1505f19480f275d7a96b0e3ddb", "67f2e1bc025833b0fa31f47195198997"}; + String[] md5OnOff = {"8a00c5ad80c43e468186ce0147553048", "67f2e1bc025833b0fa31f47195198997"}; WalkerTestSpec spec7 = new WalkerTestSpec( "-T VariantFiltration -X OnOffGenotypeRatio:threshold=0.9 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", 2, Arrays.asList(md5OnOff)); executeTest("testOnOffGenotypeFilter", spec7); - String[] md5Clusters = {"f5c7c5da0198c4aaafdfcfb3c356eedc", "8fa6b6ffc93ee7fb8d6b52a7fb7815ef"}; + String[] md5Clusters = {"ee9d8039490354cc7f2c574b5d10c7d8", "8fa6b6ffc93ee7fb8d6b52a7fb7815ef"}; WalkerTestSpec spec8 = new WalkerTestSpec( "-T VariantFiltration -X ClusteredSnps:window=10,snps=3 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", 2, Arrays.asList(md5Clusters)); executeTest("testClusteredSnpsFilter", spec8); - String[] md5Indels = {"30bf4c764f6dfd006d919ecaceee0166", "8e0e915a1cb63d7049e0671ed00101fe"}; + String[] md5Indels = {"90eb46f8d2bf4c0a72716e35c914839d", "8e0e915a1cb63d7049e0671ed00101fe"}; WalkerTestSpec spec9 = new WalkerTestSpec( "-T VariantFiltration -X IndelArtifact -B indels,PointIndel,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.indels -B cleaned,CleanedOutSNP,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.realigner_badsnps -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878", 2, diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java index f6b3921e4..dc215b05a 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java @@ -41,11 +41,10 @@ public class VCFReaderTest extends BaseTest { char referenceBase = 'N'; VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase); Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType()); - Assert.assertEquals(referenceBase,rec.getReference()); - Assert.assertEquals("N",rec.getAllele().get(0)); - Assert.assertEquals("A",rec.getAllele().get(1)); + Assert.assertEquals("N",rec.getAlleles().get(0)); + Assert.assertEquals("A",rec.getAlleles().get(1)); Map values = rec.getFields(); - Assert.assertEquals(4,values.size()); + Assert.assertEquals(3,values.size()); Assert.assertTrue(values.get("B").equals("2")); Assert.assertTrue(values.get("C").equals("3")); Assert.assertTrue(values.get("D").equals("4")); @@ -63,11 +62,10 @@ public class VCFReaderTest extends BaseTest { char referenceBase = 'N'; VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase); Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType()); - Assert.assertEquals(referenceBase,rec.getReference()); - Assert.assertEquals("N",rec.getAllele().get(0)); - Assert.assertEquals("A",rec.getAllele().get(1)); + Assert.assertEquals("N",rec.getAlleles().get(0)); + Assert.assertEquals("A",rec.getAlleles().get(1)); Map values = rec.getFields(); - Assert.assertEquals(4,values.size()); + Assert.assertEquals(3,values.size()); Assert.assertTrue(values.get("B").equals("")); Assert.assertTrue(values.get("C").equals("")); Assert.assertTrue(values.get("D").equals("4")); @@ -84,11 +82,10 @@ public class VCFReaderTest extends BaseTest { char referenceBase = 'N'; VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase); Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType()); - Assert.assertEquals(referenceBase,rec.getReference()); - Assert.assertEquals("N",rec.getAllele().get(0)); - Assert.assertEquals("A",rec.getAllele().get(1)); + Assert.assertEquals("N",rec.getAlleles().get(0)); + Assert.assertEquals("A",rec.getAlleles().get(1)); Map values = rec.getFields(); - Assert.assertEquals(4,values.size()); + Assert.assertEquals(3,values.size()); Assert.assertTrue(values.get("B").equals("")); Assert.assertTrue(values.get("C").equals("")); Assert.assertTrue(values.get("D").equals("")); diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java new file mode 100755 index 000000000..4a44d6e99 --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java @@ -0,0 +1,46 @@ +package org.broadinstitute.sting.utils.genotype.vcf; + +import org.broadinstitute.sting.BaseTest; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + + +/** + * + * @author aaron + * + * Class VCFRecordTest + * + * test the basic functionality of the vcf record + */ +public class VCFRecordTest extends BaseTest { + + private VCFRecord makeFakeVCFRecord() { + List altBases = new ArrayList(); + altBases.add("C"); + altBases.add("D1"); + Map infoFields = new HashMap(); + infoFields.put("DP","50"); + List genotypeObjects = new ArrayList(); + Map keyValues = new HashMap(); + keyValues.put("AA","2"); + List Alleles = new ArrayList(); + Alleles.add("A"); + genotypeObjects.add(new VCFGenotypeRecord("SampleName", Alleles, VCFGenotypeRecord.PHASE.PHASED, keyValues)); + return new VCFRecord('A',"chr1",1,"RANDOM",altBases,0,".",infoFields, "GT:AA",genotypeObjects); + } + + + @Test + public void testGetGenotypes() { + VCFRecord rec = makeFakeVCFRecord(); + List genotypeObjects = rec.getVCFGenotypeRecords(); + Assert.assertEquals(1,genotypeObjects.size()); + Assert.assertTrue(genotypeObjects.get(0).getSampleName().equals("SampleName")); + } +} diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java index 2241646ec..7d1a62345 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java @@ -60,18 +60,24 @@ public class VCFWriterTest extends BaseTest { * @return a VCFRecord */ private VCFRecord createVCFRecord(VCFHeader header) { - Map map = new HashMap(); - for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) - map.put(field,String.valueOf(1)); + List altBases = new ArrayList(); + altBases.add("C"); + altBases.add("D1"); + Map infoFields = new HashMap(); + infoFields.put("DP","50"); + List gt = new ArrayList(); for (String name : header.getGenotypeSamples()) { Map str = new HashMap(); - str.put("key","0|0"); - List alleles = new ArrayList(); - alleles.add("AAA"); - gt.add(new VCFGenotypeRecord(name,str,alleles, VCFGenotypeRecord.PHASE.PHASED,'A')); + str.put("bb","0"); + + List myAlleles = new ArrayList(); + myAlleles.add("C"); + myAlleles.add("D1"); + gt.add(new VCFGenotypeRecord(name, myAlleles, VCFGenotypeRecord.PHASE.PHASED, str)); } - return new VCFRecord(header,map,"GT",gt); + return new VCFRecord('A',"chr1",1,"RANDOM",altBases,0,".",infoFields, "GT:AA",gt); + }