From b6f8e33f4c754b5638ecbc74447fa8bd98146b17 Mon Sep 17 00:00:00 2001 From: ebanks Date: Sun, 6 Dec 2009 06:48:03 +0000 Subject: [PATCH] Stage 2 of Variation refactoring: VCFRecord now implements Variation, VCFGenotypeRecord now implements Genotype. Because of this change, RodVCF is now just a wrapper around the VCFRecord and does nothing else. Also, one can call toVariation on the VCFGenotypeRecord and it returns the VCFRecord. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2271 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/refdata/RodVCF.java | 228 +++++------------- .../walkers/vcftools/VCFSubsetWalker.java | 10 +- .../sting/playground/tools/vcf/VCFTool.java | 4 +- .../sting/utils/genotype/Genotype.java | 2 +- .../utils/genotype/vcf/VCFGenotypeRecord.java | 83 ++++++- .../sting/utils/genotype/vcf/VCFRecord.java | 215 +++++++++++++---- .../utils/genotype/vcf/VCFReaderTest.java | 16 ++ .../utils/genotype/vcf/VCFRecordTest.java | 18 ++ .../utils/genotype/vcf/VCFWriterTest.java | 17 ++ 9 files changed, 358 insertions(+), 235 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java b/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java index ecb9055a1..8253a30bc 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java @@ -1,10 +1,7 @@ package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.genotype.BasicGenotype; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.utils.genotype.Genotype; import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype; @@ -13,7 +10,6 @@ import org.broadinstitute.sting.utils.genotype.vcf.*; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; -import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -41,27 +37,30 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, } public void assertNotNull() { - if (mCurrentRecord == null) { + if ( mCurrentRecord == null ) { throw new UnsupportedOperationException("The current Record is null"); } } + public void assertBiAllelic() { + if ( !isBiallelic() ) + throw new StingException("This VCF rod is not bi-allelic."); + } + @Override public boolean parseLine(Object header, String[] parts) throws IOException { - throw new UnsupportedOperationException("We don't support the parse line"); + throw new UnsupportedOperationException("RodVCF does not support the parseLine method"); } public Object initialize(final File source) throws FileNotFoundException { - if (mReader == null) mReader = new VCFReader(source); + if ( mReader == null ) + mReader = new VCFReader(source); return mReader.getHeader(); } @Override public String toString() { - if (this.mCurrentRecord != null) - return this.mCurrentRecord.toStringEncoding(mReader.getHeader()); - else - return ""; + return (mCurrentRecord != null ? mCurrentRecord.toStringEncoding(mReader.getHeader()) : ""); } public static RodVCF createIterator(String name, File file) { @@ -74,13 +73,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, return vcf; } - public void assertBiAllelic() { - if (!this.isBiallelic()) throw new StingException("We're not bi-allelic."); - } - public boolean hasNonRefAlleleFrequency() { - return (this.mCurrentRecord.getInfoValues().containsKey("AF") || - (this.mCurrentRecord.getInfoValues().containsKey("AC") && this.mCurrentRecord.getInfoValues().containsKey("AN"))); + assertNotNull(); + return mCurrentRecord.getNonRefAlleleFrequency() > 0.0; } /** @@ -88,25 +83,13 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return VariantFrequency with the stored frequency */ - @Override public double getNonRefAlleleFrequency() { assertNotNull(); - if (this.mCurrentRecord.getInfoValues().containsKey("AF")) { - return Double.valueOf(this.mCurrentRecord.getInfoValues().get("AF")); - } else { - // this is the poor man's AF - if (this.mCurrentRecord.getInfoValues().containsKey("AC") && this.mCurrentRecord.getInfoValues().containsKey("AN")) { - String splt[] = mCurrentRecord.getInfoValues().get("AC").split(","); - if (splt.length > 0) { - return (Double.valueOf(splt[0]) / Double.valueOf(mCurrentRecord.getInfoValues().get("AN"))); - } - } - } - - return 0.0; + return mCurrentRecord.getNonRefAlleleFrequency(); } public boolean hasStrandBias() { + assertNotNull(); return this.mCurrentRecord.getInfoValues().containsKey("SB"); } @@ -116,19 +99,13 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * @return StrandBias with the stored slod */ public double getStrandBias() { - assertNotNull(); - if (this.mCurrentRecord.getInfoValues().containsKey("SB")) - return Double.valueOf(this.mCurrentRecord.getInfoValues().get("SB")); - return 0.0; + return hasStrandBias() ? Double.valueOf(this.mCurrentRecord.getInfoValues().get("SB")) : 0.0; } /** @return the VARIANT_TYPE of the current variant */ - @Override public VARIANT_TYPE getType() { - if (this.isSNP()) return VARIANT_TYPE.SNP; - else if (this.isInsertion()) return VARIANT_TYPE.INSERTION; - else if (this.isDeletion()) return VARIANT_TYPE.DELETION; - return VARIANT_TYPE.REFERENCE; + assertNotNull(); + return mCurrentRecord.getType(); } /** @@ -136,15 +113,10 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return true if we're a SNP */ - @Override public boolean isSNP() { - this.assertNotNull(); + assertNotNull(); assertBiAllelic(); - for (VCFGenotypeEncoding alt : this.mCurrentRecord.getAlternateAlleles()) { - if (alt.getType() != VCFGenotypeEncoding.TYPE.SINGLE_BASE) - return false; - } - return true; + return mCurrentRecord.isSNP(); } /** @@ -152,17 +124,10 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return true if we are, false otherwise */ - @Override public boolean isInsertion() { - this.assertNotNull(); + assertNotNull(); assertBiAllelic(); - if (!mCurrentRecord.hasAlternateAllele()) - return false; - for (VCFGenotypeEncoding alt : this.mCurrentRecord.getAlternateAlleles()) { - if (alt.getType() == VCFGenotypeEncoding.TYPE.INSERTION) - return true; - } - return false; + return mCurrentRecord.isInsertion(); } /** @@ -170,23 +135,16 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return true if we are, false otherwise */ - @Override public boolean isDeletion() { - this.assertNotNull(); + assertNotNull(); assertBiAllelic(); - if (!mCurrentRecord.hasAlternateAllele()) - return false; - for (VCFGenotypeEncoding alt : this.mCurrentRecord.getAlternateAlleles()) { - if (alt.getType() == VCFGenotypeEncoding.TYPE.DELETION) - return true; - } - return false; + return mCurrentRecord.isDeletion(); } @Override public GenomeLoc getLocation() { - this.assertNotNull(); - return GenomeLocParser.createGenomeLoc(mCurrentRecord.getChromosome(), mCurrentRecord.getPosition(), mCurrentRecord.getPosition()); + assertNotNull(); + return mCurrentRecord.getLocation(); } /** @@ -194,15 +152,15 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return the reference base or bases, as a string */ - @Override public String getReference() { - return String.valueOf(mCurrentRecord.getReferenceBase()); + assertNotNull(); + return mCurrentRecord.getReference(); } /** are we bi-allelic? */ - @Override public boolean isBiallelic() { - return (this.getAlternateAlleleList().size() == 1); + assertNotNull(); + return mCurrentRecord.isBiallelic(); } /** @@ -210,10 +168,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return the log based error estimate */ - @Override public double getNegLog10PError() { - // we're -10 log(error), we have to divide by 10 - return mCurrentRecord.getQual() / 10.0; + assertNotNull(); + return mCurrentRecord.getNegLog10PError(); } /** @@ -224,12 +181,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return an alternate allele list */ - @Override public List getAlternateAlleleList() { - List list = new ArrayList(); - for (VCFGenotypeEncoding enc : mCurrentRecord.getAlternateAlleles()) - list.add(enc.toString()); - return list; + assertNotNull(); + return mCurrentRecord.getAlternateAlleleList(); } /** @@ -239,12 +193,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return an alternate allele list */ - @Override public List getAlleleList() { - List ret = new ArrayList(); - ret.add(String.valueOf(mCurrentRecord.getReferenceBase())); - ret.addAll(getAlternateAlleleList()); - return ret; + assertNotNull(); + return mCurrentRecord.getAlleleList(); } /** @@ -252,9 +203,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return false if we're a variant(indel, delete, SNP, etc), true if we're not */ - @Override public boolean isReference() { - return (!mCurrentRecord.hasAlternateAllele()); + assertNotNull(); + return mCurrentRecord.isReference(); } /** @@ -262,9 +213,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return true if we're an insertion or deletion */ - @Override public boolean isIndel() { - return (!isSNP()); + assertNotNull(); + return mCurrentRecord.isIndel(); } /** @@ -273,11 +224,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return a char, representing the alternate base */ - @Override public char getAlternativeBaseForSNP() { - if (!isSNP()) throw new IllegalStateException("we're not a SNP"); - if (mCurrentRecord.getAlternateAlleles().size() != 1) throw new UnsupportedOperationException("We're not a biallelic VCF site"); - return (mCurrentRecord.getAlternateAlleles().get(0).toString()).charAt(0); + assertNotNull(); + return mCurrentRecord.getAlternativeBaseForSNP(); } /** @@ -285,10 +234,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return a char, representing the alternate base */ - @Override public char getReferenceForSNP() { - if (!isSNP()) throw new IllegalStateException("we're not a SNP"); - return mCurrentRecord.getReferenceBase(); + assertNotNull(); + return mCurrentRecord.getReferenceForSNP(); } /** @@ -296,28 +244,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return a map in lexigraphical order of the genotypes */ - @Override public Genotype getCalledGenotype() { - double refQual = (this.getNegLog10PError()); - - if (this.mCurrentRecord != null && this.mCurrentRecord.hasGenotypeData()) { - List lst = this.mCurrentRecord.getVCFGenotypeRecords(); - if (lst.size() != 1) { - throw new IllegalStateException("VCF object does not have one and only one genotype record"); - } - VCFGenotypeRecord rec = lst.get(0); - if ( rec.isEmptyGenotype() ) - return null; - - double qual = 0; - if (rec.getAlleles().equals(this.getReference())) - qual = refQual; - else if (rec.getFields().containsKey("GQ")) - qual = Double.valueOf(rec.getFields().get("GQ")) / 10.0; - int coverage = (rec.getFields().containsKey("RD") ? Integer.valueOf(rec.getFields().get("RD")) : 0); - return new VCFGenotypeCall(this.getReference().charAt(0), this.getLocation(), Utils.join("", rec.getAlleles()), qual, coverage, rec.getSampleName()); - } - return null; + assertNotNull(); + return mCurrentRecord.getCalledGenotype(); } /** @@ -325,53 +254,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return a list of the genotypes */ - @Override public List getGenotypes() { - List genotypes = new ArrayList(); - if (!this.mCurrentRecord.hasGenotypeData()) { - return genotypes; - } - double refQual = (this.getNegLog10PError()); - // add the reference - for (VCFGenotypeRecord rec : mCurrentRecord.getVCFGenotypeRecords()) { - if ( rec.isEmptyGenotype() ) - continue; - - double qual = 0; - if (rec.getAlleles().equals(this.getReference())) - qual = refQual; - else if (rec.getFields().containsKey("GQ")) - qual = Double.valueOf(rec.getFields().get("GQ")) / 10.0; - int coverage = (rec.getFields().containsKey("RD") ? Integer.valueOf(rec.getFields().get("RD")) : 0); - genotypes.add(new VCFGenotypeCall(this.getReference().charAt(0), this.getLocation(), Utils.join("", rec.getAlleles()), qual, coverage, rec.getSampleName())); - } - return genotypes; - } - - /** - * a private helper method - * - * @return an array in lexigraphical order of the likelihoods - */ - private Genotype getGenotype(DiploidGenotype x) { - if (x.toString().equals(getReference())) - return new BasicGenotype(this.getLocation(), getReference(), this.getReference().charAt(0), 0); - for (VCFGenotypeRecord record : mCurrentRecord.getVCFGenotypeRecords()) { - if (Utils.join("", record.getAlleles()).equals(x.toString())) { - double qual = 0.0; - if (record.getAlleles().equals(this.getReference())) - qual = this.getNegLog10PError(); - else if (record.getFields().containsKey("GQ")) - qual = Double.valueOf(record.getFields().get("GQ")) / 10.0; - int coverage = (record.getFields().containsKey("RD") ? Integer.valueOf(record.getFields().get("RD")) : 0); - return new VCFGenotypeCall(this.getReference().charAt(0), this.getLocation(), Utils.join("", record.getAlleles()), qual, coverage, record.getSampleName()); - } - } - return null; - } - - public VCFHeader getHeader() { - return mReader.getHeader(); + assertNotNull(); + return mCurrentRecord.getGenotypes(); } /** @@ -382,26 +267,25 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, * * @return true if available, false otherwise */ - @Override public boolean hasGenotype(DiploidGenotype x) { - if (getGenotype(x) != null) - return true; - return false; + assertNotNull(); + return mCurrentRecord.hasGenotype(x); + } + + public VCFHeader getHeader() { + return mReader.getHeader(); } - @Override public boolean hasNext() { - return (mReader.hasNext()); + return mReader.hasNext(); } - @Override public RodVCF next() { mCurrentRecord = mReader.next(); - return new RodVCF(this.name, mCurrentRecord, mReader); + return new RodVCF(name, mCurrentRecord, mReader); } - @Override public void remove() { - throw new UnsupportedOperationException("You cannot remove from a VCF rod"); + throw new UnsupportedOperationException("The remove operation is not supported for a VCF rod"); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java index 7aeb787ec..86b0a6cab 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java @@ -77,7 +77,9 @@ public class VCFSubsetWalker extends RefWalker, VCFWriter> private VCFRecord subsetRecord(VCFRecord record) { ArrayList genotypeRecords = new ArrayList(); - for (VCFGenotypeRecord gr : record.getVCFGenotypeRecords()) { + for (int i = 0; i < record.getGenotypes().size(); i++) { + VCFGenotypeRecord gr = (VCFGenotypeRecord)record.getGenotypes().get(i); + //if (gr.getSampleName().equalsIgnoreCase(SAMPLE)) { if (SAMPLES.contains(gr.getSampleName())) { //out.println(gr.getSampleName() + " " + gr.toGenotypeString(record.getAlternateAlleles())); @@ -86,8 +88,8 @@ public class VCFSubsetWalker extends RefWalker, VCFWriter> } VCFRecord subset = new VCFRecord(record.getReferenceBase(), - record.getChromosome(), - (int) record.getPosition(), + record.getLocation().getContig(), + (int) record.getLocation().getStart(), record.getID(), record.getAlternateAlleles(), record.getQual(), @@ -104,7 +106,7 @@ public class VCFSubsetWalker extends RefWalker, VCFWriter> VCFRecord subset = subsetRecord(record); boolean isVariant = false; - for (VCFGenotypeEncoding ge : subset.getVCFGenotypeRecords().get(0).getAlleles()) { + for (VCFGenotypeEncoding ge : ((VCFGenotypeRecord)subset.getGenotypes().get(0)).getAlleles()) { if (record.getReferenceBase() != ge.getBases().charAt(0)) { isVariant = true; } diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFTool.java b/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFTool.java index 06500ec48..c93f2fa0e 100644 --- a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFTool.java +++ b/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFTool.java @@ -509,8 +509,8 @@ public class VCFTool public static Interval getIntervalFromRecord(VCFRecord record) { - String chr = record.getChromosome(); - long off = record.getPosition(); + String chr = record.getLocation().getContig(); + long off = record.getLocation().getStart(); return new Interval(chr, (int)off, (int)off); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java index 1adf93399..9da4be99c 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java @@ -5,7 +5,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; /** * @author aaron *

- * Class GenotypeLikelihood + * Class Genotype *

* This class emcompasses all the basic information about a genotype */ diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java index 87940d3ef..261a97724 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java @@ -1,5 +1,8 @@ package org.broadinstitute.sting.utils.genotype.vcf; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.genotype.*; + import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -11,10 +14,8 @@ import java.util.Map; *

* Class VCFGenotypeRecord *

- * The genotype record in VCF store a considerable amount of information, - * so they were broken off into their own class */ -public class VCFGenotypeRecord { +public class VCFGenotypeRecord implements Genotype { // the symbol for a empty genotype public static final String EMPTY_GENOTYPE = "."; @@ -23,6 +24,9 @@ public class VCFGenotypeRecord { UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN } + // our record + private VCFRecord mRecord; + // our phasing private PHASE mPhaseType; @@ -38,10 +42,10 @@ public class VCFGenotypeRecord { /** * Create a VCF genotype record * - * @param sampleName - * @param genotypes - * @param phasing - * @param otherFlags + * @param sampleName sample name + * @param genotypes list of genotypes + * @param phasing phasing + * @param otherFlags other flags */ public VCFGenotypeRecord(String sampleName, List genotypes, PHASE phasing, Map otherFlags) { this.mSampleName = sampleName; @@ -50,11 +54,16 @@ public class VCFGenotypeRecord { if (otherFlags != null) this.mFields.putAll(otherFlags); } + public void setVCFRecord(VCFRecord record) { + this.mRecord = record; + } /** * determine the phase of the genotype * * @param phase the string that contains the phase character + * + * @return the phase */ static PHASE determinePhase(String phase) { // find the phasing information @@ -68,7 +77,6 @@ public class VCFGenotypeRecord { throw new IllegalArgumentException("Unknown genotype phasing parameter"); } - /** getter methods */ public PHASE getPhaseType() { return mPhaseType; @@ -86,6 +94,64 @@ public class VCFGenotypeRecord { return mFields; } + public double getNegLog10PError() { + return ( mFields.containsKey("GQ") ? Double.valueOf(mFields.get("GQ")) / 10.0 : 0.0); + } + + public GenomeLoc getLocation() { + return mRecord != null ? mRecord.getLocation() : null; + } + + public char getReference() { + return mRecord != null ? mRecord.getReferenceBase() : 'N'; + } + + public Variation toVariation(char ref) { + return mRecord != null ? mRecord : null; + } + + public String getBases() { + String genotype = ""; + for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) + genotype += encoding.getBases(); + return genotype; + } + + public boolean isVariant(char ref) { + for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) { + if ( encoding.getType() == VCFGenotypeEncoding.TYPE.UNCALLED ) + continue; + if ( encoding.getType() != VCFGenotypeEncoding.TYPE.SINGLE_BASE || + encoding.getBases().charAt(0) != ref ) + return true; + } + return false; + } + + public boolean isPointGenotype() { + return (mRecord != null ? !mRecord.isIndel() : true); + } + + public boolean isHom() { + if ( mGenotypeAlleles.size() == 0 ) + return true; + + String bases = mGenotypeAlleles.get(0).getBases(); + for ( int i = 1; i < mGenotypeAlleles.size(); i++ ) { + if ( !bases.equals(mGenotypeAlleles.get(1).getBases()) ) + return false; + } + return true; + } + + public boolean isHet() { + return !isHom(); + } + + public int getPloidy() { + return 2; + } + private String toGenotypeString(List altAlleles) { String str = ""; boolean first = true; @@ -144,7 +210,6 @@ public class VCFGenotypeRecord { public String toStringEncoding(List altAlleles) { StringBuilder builder = new StringBuilder(); builder.append(toGenotypeString(altAlleles)); - boolean first = true; for (String field : mFields.keySet()) { if (mFields.get(field).equals("")) continue; builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index 8e93cd2ea..c93fc025e 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -1,14 +1,16 @@ package org.broadinstitute.sting.utils.genotype.vcf; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.*; import java.util.*; /** * the basic VCF record type */ -public class VCFRecord { +public class VCFRecord implements Variation, VariantBackedByGenotype { + // commonly used strings that are in the standard public static final String FORMAT_FIELD_SEPERATOR = ":"; public static final String GENOTYPE_FIELD_SEPERATOR = ":"; @@ -17,12 +19,11 @@ public class VCFRecord { public static final String INFO_FIELD_SEPERATOR = ";"; public static final String EMPTY_INFO_FIELD = "."; public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f"; + // the reference base private char mReferenceBase; - // our contig - private String mChrome; - // our position - private int mPosition; + // our location + private GenomeLoc mLoc; // our id; set to '.' if not available private String mID; // the alternate bases @@ -37,7 +38,7 @@ public class VCFRecord { private final String mGenotypeFormatString; // our genotype sample fields - private final List mGenotypeFields = new ArrayList(); + private final List mGenotypeFields = new ArrayList(); /** * given a VCF header, and the values for each of the columns, create a VCF record. @@ -63,6 +64,7 @@ public class VCFRecord { * @param qual the qual field * @param filters the filters used on this variant * @param infoFields the information fields + * @param genotypeFormatString the format string * @param genotypeObjects the genotype objects */ public VCFRecord(char referenceBase, @@ -76,8 +78,7 @@ public class VCFRecord { String genotypeFormatString, List genotypeObjects) { this.setReferenceBase(referenceBase); - this.mChrome = contig; - this.setPosition(position); + this.setLocation(contig, position); this.mID = ID; for (VCFGenotypeEncoding alt : altBases) this.addAlternateBase(alt); @@ -85,6 +86,10 @@ public class VCFRecord { this.setFilterString(filters); this.mInfoFields.putAll(infoFields); mGenotypeFormatString = genotypeFormatString; + + // associate the genotypes with this Variation, then add them + for ( VCFGenotypeRecord rec : genotypeObjects ) + rec.setVCFRecord(this); this.mGenotypeFields.addAll(genotypeObjects); } @@ -104,13 +109,16 @@ public class VCFRecord { * @param columnValues a map of the header fields to values */ private void extractFields(Map columnValues) { + String chrom = null; + int position = -1; + for (VCFHeader.HEADER_FIELDS val : columnValues.keySet()) { switch (val) { case CHROM: - this.setChomosome(columnValues.get(val)); + chrom = columnValues.get(val); break; case POS: - this.setPosition(Integer.valueOf(columnValues.get(val))); + position = Integer.valueOf(columnValues.get(val)); break; case ID: this.setID(columnValues.get(val)); @@ -146,6 +154,7 @@ public class VCFRecord { break; } } + setLocation(chrom, position); } /** @@ -159,18 +168,10 @@ public class VCFRecord { } /** - * @return the string for the chromosome that this VCF record is associated with + * @return this VCF record's location */ - public String getChromosome() { - return this.mChrome; - } - - - /** - * @return this VCF records position on the specified chromosome - */ - public long getPosition() { - return this.mPosition; + public GenomeLoc getLocation() { + return this.mLoc; } /** @@ -180,13 +181,22 @@ public class VCFRecord { return this.mID; } + /** + * get the reference base + * + * @return either A, T, C, G, or N + */ + public String getReference() { + return Character.toString(mReferenceBase); + } + /** * get the reference base * * @return either A, T, C, G, or N */ public char getReferenceBase() { - return this.mReferenceBase; + return mReferenceBase; } /** @@ -194,6 +204,13 @@ public class VCFRecord { * * @return an array of strings representing the alt alleles, or null if there are none */ + public List getAlternateAlleleList() { + ArrayList alts = new ArrayList(); + for ( VCFGenotypeEncoding alt : mAlts ) + alts.add(alt.getBases()); + return alts; + } + public List getAlternateAlleles() { return this.mAlts; } @@ -202,6 +219,83 @@ public class VCFRecord { return getAlternateAlleles().size() > 0; } + public boolean isBiallelic() { + return getAlternateAlleles().size() == 1; + } + + public boolean isReference() { + return !hasAlternateAllele(); + } + + public List getAlleleList() { + ArrayList list = new ArrayList(); + list.add(getReference()); + list.addAll(getAlternateAlleleList()); + return list; + } + + public double getNonRefAlleleFrequency() { + if ( mInfoFields.containsKey("AF") ) { + return Double.valueOf(mInfoFields.get("AF")); + } else { + // this is the poor man's AF + if ( mInfoFields.containsKey("AC") && mInfoFields.containsKey("AN")) { + String splt[] = mInfoFields.get("AC").split(","); + if ( splt.length > 0 ) { + return (Double.valueOf(splt[0]) / Double.valueOf(mInfoFields.get("AN"))); + } + } + } + + return 0.0; + } + + public VARIANT_TYPE getType() { + if ( !hasAlternateAllele() ) + return VARIANT_TYPE.REFERENCE; + + // TODO -- figure out what to do about records with more than one type + VCFGenotypeEncoding encoding = mAlts.get(0); + switch ( encoding.getType() ) { + case SINGLE_BASE: + return VARIANT_TYPE.SNP; + case DELETION: + return VARIANT_TYPE.DELETION; + case INSERTION: + return VARIANT_TYPE.INSERTION; + } + + throw new IllegalStateException("The record contains unknown genotype encodings"); + } + + public boolean isDeletion() { + return getType() == VARIANT_TYPE.DELETION; + } + + public boolean isInsertion() { + return getType() == VARIANT_TYPE.INSERTION; + } + + public boolean isIndel() { + return isDeletion() || isInsertion(); + } + + public boolean isSNP() { + return getType() == VARIANT_TYPE.SNP; + } + + public char getAlternativeBaseForSNP() { + if ( !isSNP() && !isBiallelic() ) + throw new IllegalStateException("This record does not represent a SNP"); + return mAlts.get(0).getBases().charAt(0); + } + + public char getReferenceForSNP() { + if ( !isSNP() ) + throw new IllegalStateException("This record does not represent a SNP"); + return getReferenceBase(); + } + /** * @return the phred-scaled quality score */ @@ -209,6 +303,13 @@ public class VCFRecord { return this.mQual; } + /** + * @return the -log10PError + */ + public double getNegLog10PError() { + return this.mQual / 10.0; + } + /** * get the filter criteria * @@ -257,13 +358,34 @@ public class VCFRecord { return VCFHeader.HEADER_FIELDS.values().length; } - /** - * return the mapping of the format tags to the specified sample's values - * - * @return a VCFGenotypeRecord - */ public List getVCFGenotypeRecords() { - return this.mGenotypeFields; + ArrayList list = new ArrayList(); + for ( Genotype g : mGenotypeFields ) + list.add((VCFGenotypeRecord)g); + return list; + } + + public List getGenotypes() { + return mGenotypeFields; + } + + public Genotype getCalledGenotype() { + if ( mGenotypeFields == null || mGenotypeFields.size() != 1 ) + throw new IllegalStateException("There is not one and only one genotype associated with this record"); + VCFGenotypeRecord record = (VCFGenotypeRecord)mGenotypeFields.get(0); + if ( record.isEmptyGenotype() ) + return null; + return record; + } + + public boolean hasGenotype(DiploidGenotype x) { + if ( mGenotypeFields == null ) + return false; + for ( Genotype g : mGenotypeFields ) { + if ( DiploidGenotype.valueOf(g.getBases()).equals(x) ) + return true; + } + return false; } /** @@ -271,9 +393,10 @@ public class VCFRecord { */ public String[] getSampleNames() { String names[] = new String[mGenotypeFields.size()]; - int index = 0; - for (VCFGenotypeRecord rec : this.mGenotypeFields) - names[index++] = rec.getSampleName(); + for (int i = 0; i < mGenotypeFields.size(); i++) { + VCFGenotypeRecord rec = (VCFGenotypeRecord)mGenotypeFields.get(i); + names[i] = rec.getSampleName(); + } return names; } @@ -289,14 +412,12 @@ public class VCFRecord { this.mReferenceBase = referenceBase; } - public void setPosition(int mPosition) { - if (mPosition < 0) + public void setLocation(String chrom, int position) { + if ( chrom == null ) + throw new IllegalArgumentException("Chromosomes cannot be missing"); + if ( position < 0 ) throw new IllegalArgumentException("Position values must be greater than 0"); - this.mPosition = mPosition; - } - - public void setChomosome(String mChrome) { - this.mChrome = mChrome; + this.mLoc = GenomeLocParser.createGenomeLoc(chrom, position); } public void setID(String mID) { @@ -369,9 +490,9 @@ public class VCFRecord { StringBuilder builder = new StringBuilder(); // CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO - builder.append(getChromosome()); + builder.append(mLoc.getContig()); builder.append(FIELD_SEPERATOR); - builder.append(getPosition()); + builder.append(mLoc.getStart()); builder.append(FIELD_SEPERATOR); builder.append(getID()); builder.append(FIELD_SEPERATOR); @@ -416,10 +537,10 @@ public class VCFRecord { */ private void addGenotypeData(StringBuilder builder, VCFHeader header) { builder.append(FIELD_SEPERATOR + this.getGenotypeFormatString()); - if (header.getGenotypeSamples().size() < getVCFGenotypeRecords().size()) + if (header.getGenotypeSamples().size() < getGenotypes().size()) throw new RuntimeException("We have more genotype samples than the header specified"); - Map gMap = genotypeListToMap(getVCFGenotypeRecords()); + Map gMap = genotypeListToMap(getGenotypes()); for (String genotype : header.getGenotypeSamples()) { builder.append(FIELD_SEPERATOR); @@ -447,8 +568,7 @@ public class VCFRecord { public boolean equals(VCFRecord other) { if (!this.mAlts.equals(other.mAlts)) return false; if (this.mReferenceBase != other.mReferenceBase) return false; - if (!this.mChrome.equals(other.mChrome)) return false; - if (this.mPosition != other.mPosition) return false; + if (!this.mLoc.equals(other.mLoc)) return false; if (!this.mID.equals(other.mID)) return false; if (this.mQual != other.mQual) return false; if (!this.mFilterString.equals(other.mFilterString)) return false; @@ -463,9 +583,10 @@ public class VCFRecord { * @param list a list of genotype samples * @return a mapping of the sample name to VCF genotype record */ - private static Map genotypeListToMap(List list) { + private static Map genotypeListToMap(List list) { Map map = new HashMap(); - for (VCFGenotypeRecord rec : list) { + for (int i = 0; i < list.size(); i++) { + VCFGenotypeRecord rec = (VCFGenotypeRecord)list.get(i); map.put(rec.getSampleName(), rec); } return map; diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java index 6195dbe5a..15f32b443 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java @@ -1,9 +1,13 @@ package org.broadinstitute.sting.utils.genotype.vcf; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.gatk.refdata.RodVCF; import org.junit.Assert; import org.junit.Test; +import org.junit.BeforeClass; import java.io.*; import java.util.ArrayList; @@ -18,6 +22,18 @@ public class VCFReaderTest extends BaseTest { private static final File multiSampleVCF = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/MultiSample.vcf"); private static final String VCF_MIXUP_FILE = "/humgen/gsa-scr1/GATK_Data/Validation_Data/mixedup.vcf"; + private static IndexedFastaSequenceFile seq; + + @BeforeClass + public static void beforeTests() { + try { + seq = new IndexedFastaSequenceFile(new File("/broad/1KG/reference/human_b36_both.fasta")); + } catch (FileNotFoundException e) { + throw new StingException("unable to load the sequence dictionary"); + } + GenomeLocParser.setupRefContigOrdering(seq); + } + @Test public void testVCFInput() { VCFReader reader = new VCFReader(vcfFile); diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java index cd3ff8feb..09a0c8ba7 100755 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java @@ -1,10 +1,16 @@ package org.broadinstitute.sting.utils.genotype.vcf; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.junit.Assert; import org.junit.Test; +import org.junit.BeforeClass; import java.util.*; +import java.io.File; +import java.io.FileNotFoundException; /** @@ -16,6 +22,18 @@ import java.util.*; */ public class VCFRecordTest extends BaseTest { + private static IndexedFastaSequenceFile seq; + + @BeforeClass + public static void beforeTests() { + try { + seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); + } catch (FileNotFoundException e) { + throw new StingException("unable to load the sequence dictionary"); + } + GenomeLocParser.setupRefContigOrdering(seq); + } + /** * create a fake VCF record * diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java index 3bce24427..edfd3fca3 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java @@ -1,10 +1,15 @@ package org.broadinstitute.sting.utils.genotype.vcf; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.junit.Assert; import org.junit.Test; +import org.junit.BeforeClass; import java.io.File; +import java.io.FileNotFoundException; import java.util.*; @@ -21,6 +26,18 @@ public class VCFWriterTest extends BaseTest { private Set additionalColumns = new HashSet(); private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf"); + private static IndexedFastaSequenceFile seq; + + @BeforeClass + public static void beforeTests() { + try { + seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); + } catch (FileNotFoundException e) { + throw new StingException("unable to load the sequence dictionary"); + } + GenomeLocParser.setupRefContigOrdering(seq); + } + /** test, using the writer and reader, that we can output and input a VCF file without problems */ @Test public void testBasicWriteAndRead() {