Stage 2 of Variation refactoring:

VCFRecord now implements Variation, VCFGenotypeRecord now implements Genotype.

Because of this change, RodVCF is now just a wrapper around the VCFRecord and does nothing else.  Also, one can call toVariation on the VCFGenotypeRecord and it returns the VCFRecord.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2271 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-12-06 06:48:03 +00:00
parent 3b440e0dbc
commit b6f8e33f4c
9 changed files with 358 additions and 235 deletions

View File

@ -1,10 +1,7 @@
package org.broadinstitute.sting.gatk.refdata; package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.genotype.Genotype; import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype; import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
@ -13,7 +10,6 @@ import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@ -41,27 +37,30 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
} }
public void assertNotNull() { public void assertNotNull() {
if (mCurrentRecord == null) { if ( mCurrentRecord == null ) {
throw new UnsupportedOperationException("The current Record is null"); throw new UnsupportedOperationException("The current Record is null");
} }
} }
public void assertBiAllelic() {
if ( !isBiallelic() )
throw new StingException("This VCF rod is not bi-allelic.");
}
@Override @Override
public boolean parseLine(Object header, String[] parts) throws IOException { public boolean parseLine(Object header, String[] parts) throws IOException {
throw new UnsupportedOperationException("We don't support the parse line"); throw new UnsupportedOperationException("RodVCF does not support the parseLine method");
} }
public Object initialize(final File source) throws FileNotFoundException { public Object initialize(final File source) throws FileNotFoundException {
if (mReader == null) mReader = new VCFReader(source); if ( mReader == null )
mReader = new VCFReader(source);
return mReader.getHeader(); return mReader.getHeader();
} }
@Override @Override
public String toString() { public String toString() {
if (this.mCurrentRecord != null) return (mCurrentRecord != null ? mCurrentRecord.toStringEncoding(mReader.getHeader()) : "");
return this.mCurrentRecord.toStringEncoding(mReader.getHeader());
else
return "";
} }
public static RodVCF createIterator(String name, File file) { public static RodVCF createIterator(String name, File file) {
@ -74,13 +73,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
return vcf; return vcf;
} }
public void assertBiAllelic() {
if (!this.isBiallelic()) throw new StingException("We're not bi-allelic.");
}
public boolean hasNonRefAlleleFrequency() { public boolean hasNonRefAlleleFrequency() {
return (this.mCurrentRecord.getInfoValues().containsKey("AF") || assertNotNull();
(this.mCurrentRecord.getInfoValues().containsKey("AC") && this.mCurrentRecord.getInfoValues().containsKey("AN"))); return mCurrentRecord.getNonRefAlleleFrequency() > 0.0;
} }
/** /**
@ -88,25 +83,13 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return VariantFrequency with the stored frequency * @return VariantFrequency with the stored frequency
*/ */
@Override
public double getNonRefAlleleFrequency() { public double getNonRefAlleleFrequency() {
assertNotNull(); assertNotNull();
if (this.mCurrentRecord.getInfoValues().containsKey("AF")) { return mCurrentRecord.getNonRefAlleleFrequency();
return Double.valueOf(this.mCurrentRecord.getInfoValues().get("AF"));
} else {
// this is the poor man's AF
if (this.mCurrentRecord.getInfoValues().containsKey("AC") && this.mCurrentRecord.getInfoValues().containsKey("AN")) {
String splt[] = mCurrentRecord.getInfoValues().get("AC").split(",");
if (splt.length > 0) {
return (Double.valueOf(splt[0]) / Double.valueOf(mCurrentRecord.getInfoValues().get("AN")));
}
}
}
return 0.0;
} }
public boolean hasStrandBias() { public boolean hasStrandBias() {
assertNotNull();
return this.mCurrentRecord.getInfoValues().containsKey("SB"); return this.mCurrentRecord.getInfoValues().containsKey("SB");
} }
@ -116,19 +99,13 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* @return StrandBias with the stored slod * @return StrandBias with the stored slod
*/ */
public double getStrandBias() { public double getStrandBias() {
assertNotNull(); return hasStrandBias() ? Double.valueOf(this.mCurrentRecord.getInfoValues().get("SB")) : 0.0;
if (this.mCurrentRecord.getInfoValues().containsKey("SB"))
return Double.valueOf(this.mCurrentRecord.getInfoValues().get("SB"));
return 0.0;
} }
/** @return the VARIANT_TYPE of the current variant */ /** @return the VARIANT_TYPE of the current variant */
@Override
public VARIANT_TYPE getType() { public VARIANT_TYPE getType() {
if (this.isSNP()) return VARIANT_TYPE.SNP; assertNotNull();
else if (this.isInsertion()) return VARIANT_TYPE.INSERTION; return mCurrentRecord.getType();
else if (this.isDeletion()) return VARIANT_TYPE.DELETION;
return VARIANT_TYPE.REFERENCE;
} }
/** /**
@ -136,15 +113,10 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return true if we're a SNP * @return true if we're a SNP
*/ */
@Override
public boolean isSNP() { public boolean isSNP() {
this.assertNotNull(); assertNotNull();
assertBiAllelic(); assertBiAllelic();
for (VCFGenotypeEncoding alt : this.mCurrentRecord.getAlternateAlleles()) { return mCurrentRecord.isSNP();
if (alt.getType() != VCFGenotypeEncoding.TYPE.SINGLE_BASE)
return false;
}
return true;
} }
/** /**
@ -152,17 +124,10 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return true if we are, false otherwise * @return true if we are, false otherwise
*/ */
@Override
public boolean isInsertion() { public boolean isInsertion() {
this.assertNotNull(); assertNotNull();
assertBiAllelic(); assertBiAllelic();
if (!mCurrentRecord.hasAlternateAllele()) return mCurrentRecord.isInsertion();
return false;
for (VCFGenotypeEncoding alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.getType() == VCFGenotypeEncoding.TYPE.INSERTION)
return true;
}
return false;
} }
/** /**
@ -170,23 +135,16 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return true if we are, false otherwise * @return true if we are, false otherwise
*/ */
@Override
public boolean isDeletion() { public boolean isDeletion() {
this.assertNotNull(); assertNotNull();
assertBiAllelic(); assertBiAllelic();
if (!mCurrentRecord.hasAlternateAllele()) return mCurrentRecord.isDeletion();
return false;
for (VCFGenotypeEncoding alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.getType() == VCFGenotypeEncoding.TYPE.DELETION)
return true;
}
return false;
} }
@Override @Override
public GenomeLoc getLocation() { public GenomeLoc getLocation() {
this.assertNotNull(); assertNotNull();
return GenomeLocParser.createGenomeLoc(mCurrentRecord.getChromosome(), mCurrentRecord.getPosition(), mCurrentRecord.getPosition()); return mCurrentRecord.getLocation();
} }
/** /**
@ -194,15 +152,15 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return the reference base or bases, as a string * @return the reference base or bases, as a string
*/ */
@Override
public String getReference() { public String getReference() {
return String.valueOf(mCurrentRecord.getReferenceBase()); assertNotNull();
return mCurrentRecord.getReference();
} }
/** are we bi-allelic? */ /** are we bi-allelic? */
@Override
public boolean isBiallelic() { public boolean isBiallelic() {
return (this.getAlternateAlleleList().size() == 1); assertNotNull();
return mCurrentRecord.isBiallelic();
} }
/** /**
@ -210,10 +168,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return the log based error estimate * @return the log based error estimate
*/ */
@Override
public double getNegLog10PError() { public double getNegLog10PError() {
// we're -10 log(error), we have to divide by 10 assertNotNull();
return mCurrentRecord.getQual() / 10.0; return mCurrentRecord.getNegLog10PError();
} }
/** /**
@ -224,12 +181,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return an alternate allele list * @return an alternate allele list
*/ */
@Override
public List<String> getAlternateAlleleList() { public List<String> getAlternateAlleleList() {
List<String> list = new ArrayList<String>(); assertNotNull();
for (VCFGenotypeEncoding enc : mCurrentRecord.getAlternateAlleles()) return mCurrentRecord.getAlternateAlleleList();
list.add(enc.toString());
return list;
} }
/** /**
@ -239,12 +193,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return an alternate allele list * @return an alternate allele list
*/ */
@Override
public List<String> getAlleleList() { public List<String> getAlleleList() {
List<String> ret = new ArrayList<String>(); assertNotNull();
ret.add(String.valueOf(mCurrentRecord.getReferenceBase())); return mCurrentRecord.getAlleleList();
ret.addAll(getAlternateAlleleList());
return ret;
} }
/** /**
@ -252,9 +203,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return false if we're a variant(indel, delete, SNP, etc), true if we're not * @return false if we're a variant(indel, delete, SNP, etc), true if we're not
*/ */
@Override
public boolean isReference() { public boolean isReference() {
return (!mCurrentRecord.hasAlternateAllele()); assertNotNull();
return mCurrentRecord.isReference();
} }
/** /**
@ -262,9 +213,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return true if we're an insertion or deletion * @return true if we're an insertion or deletion
*/ */
@Override
public boolean isIndel() { public boolean isIndel() {
return (!isSNP()); assertNotNull();
return mCurrentRecord.isIndel();
} }
/** /**
@ -273,11 +224,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return a char, representing the alternate base * @return a char, representing the alternate base
*/ */
@Override
public char getAlternativeBaseForSNP() { public char getAlternativeBaseForSNP() {
if (!isSNP()) throw new IllegalStateException("we're not a SNP"); assertNotNull();
if (mCurrentRecord.getAlternateAlleles().size() != 1) throw new UnsupportedOperationException("We're not a biallelic VCF site"); return mCurrentRecord.getAlternativeBaseForSNP();
return (mCurrentRecord.getAlternateAlleles().get(0).toString()).charAt(0);
} }
/** /**
@ -285,10 +234,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return a char, representing the alternate base * @return a char, representing the alternate base
*/ */
@Override
public char getReferenceForSNP() { public char getReferenceForSNP() {
if (!isSNP()) throw new IllegalStateException("we're not a SNP"); assertNotNull();
return mCurrentRecord.getReferenceBase(); return mCurrentRecord.getReferenceForSNP();
} }
/** /**
@ -296,28 +244,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return a map in lexigraphical order of the genotypes * @return a map in lexigraphical order of the genotypes
*/ */
@Override
public Genotype getCalledGenotype() { public Genotype getCalledGenotype() {
double refQual = (this.getNegLog10PError()); assertNotNull();
return mCurrentRecord.getCalledGenotype();
if (this.mCurrentRecord != null && this.mCurrentRecord.hasGenotypeData()) {
List<VCFGenotypeRecord> lst = this.mCurrentRecord.getVCFGenotypeRecords();
if (lst.size() != 1) {
throw new IllegalStateException("VCF object does not have one and only one genotype record");
}
VCFGenotypeRecord rec = lst.get(0);
if ( rec.isEmptyGenotype() )
return null;
double qual = 0;
if (rec.getAlleles().equals(this.getReference()))
qual = refQual;
else if (rec.getFields().containsKey("GQ"))
qual = Double.valueOf(rec.getFields().get("GQ")) / 10.0;
int coverage = (rec.getFields().containsKey("RD") ? Integer.valueOf(rec.getFields().get("RD")) : 0);
return new VCFGenotypeCall(this.getReference().charAt(0), this.getLocation(), Utils.join("", rec.getAlleles()), qual, coverage, rec.getSampleName());
}
return null;
} }
/** /**
@ -325,53 +254,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return a list of the genotypes * @return a list of the genotypes
*/ */
@Override
public List<Genotype> getGenotypes() { public List<Genotype> getGenotypes() {
List<Genotype> genotypes = new ArrayList<Genotype>(); assertNotNull();
if (!this.mCurrentRecord.hasGenotypeData()) { return mCurrentRecord.getGenotypes();
return genotypes;
}
double refQual = (this.getNegLog10PError());
// add the reference
for (VCFGenotypeRecord rec : mCurrentRecord.getVCFGenotypeRecords()) {
if ( rec.isEmptyGenotype() )
continue;
double qual = 0;
if (rec.getAlleles().equals(this.getReference()))
qual = refQual;
else if (rec.getFields().containsKey("GQ"))
qual = Double.valueOf(rec.getFields().get("GQ")) / 10.0;
int coverage = (rec.getFields().containsKey("RD") ? Integer.valueOf(rec.getFields().get("RD")) : 0);
genotypes.add(new VCFGenotypeCall(this.getReference().charAt(0), this.getLocation(), Utils.join("", rec.getAlleles()), qual, coverage, rec.getSampleName()));
}
return genotypes;
}
/**
* a private helper method
*
* @return an array in lexigraphical order of the likelihoods
*/
private Genotype getGenotype(DiploidGenotype x) {
if (x.toString().equals(getReference()))
return new BasicGenotype(this.getLocation(), getReference(), this.getReference().charAt(0), 0);
for (VCFGenotypeRecord record : mCurrentRecord.getVCFGenotypeRecords()) {
if (Utils.join("", record.getAlleles()).equals(x.toString())) {
double qual = 0.0;
if (record.getAlleles().equals(this.getReference()))
qual = this.getNegLog10PError();
else if (record.getFields().containsKey("GQ"))
qual = Double.valueOf(record.getFields().get("GQ")) / 10.0;
int coverage = (record.getFields().containsKey("RD") ? Integer.valueOf(record.getFields().get("RD")) : 0);
return new VCFGenotypeCall(this.getReference().charAt(0), this.getLocation(), Utils.join("", record.getAlleles()), qual, coverage, record.getSampleName());
}
}
return null;
}
public VCFHeader getHeader() {
return mReader.getHeader();
} }
/** /**
@ -382,26 +267,25 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* *
* @return true if available, false otherwise * @return true if available, false otherwise
*/ */
@Override
public boolean hasGenotype(DiploidGenotype x) { public boolean hasGenotype(DiploidGenotype x) {
if (getGenotype(x) != null) assertNotNull();
return true; return mCurrentRecord.hasGenotype(x);
return false; }
public VCFHeader getHeader() {
return mReader.getHeader();
} }
@Override
public boolean hasNext() { public boolean hasNext() {
return (mReader.hasNext()); return mReader.hasNext();
} }
@Override
public RodVCF next() { public RodVCF next() {
mCurrentRecord = mReader.next(); mCurrentRecord = mReader.next();
return new RodVCF(this.name, mCurrentRecord, mReader); return new RodVCF(name, mCurrentRecord, mReader);
} }
@Override
public void remove() { public void remove() {
throw new UnsupportedOperationException("You cannot remove from a VCF rod"); throw new UnsupportedOperationException("The remove operation is not supported for a VCF rod");
} }
} }

View File

@ -77,7 +77,9 @@ public class VCFSubsetWalker extends RefWalker<ArrayList<VCFRecord>, VCFWriter>
private VCFRecord subsetRecord(VCFRecord record) { private VCFRecord subsetRecord(VCFRecord record) {
ArrayList<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>(); ArrayList<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
for (VCFGenotypeRecord gr : record.getVCFGenotypeRecords()) { for (int i = 0; i < record.getGenotypes().size(); i++) {
VCFGenotypeRecord gr = (VCFGenotypeRecord)record.getGenotypes().get(i);
//if (gr.getSampleName().equalsIgnoreCase(SAMPLE)) { //if (gr.getSampleName().equalsIgnoreCase(SAMPLE)) {
if (SAMPLES.contains(gr.getSampleName())) { if (SAMPLES.contains(gr.getSampleName())) {
//out.println(gr.getSampleName() + " " + gr.toGenotypeString(record.getAlternateAlleles())); //out.println(gr.getSampleName() + " " + gr.toGenotypeString(record.getAlternateAlleles()));
@ -86,8 +88,8 @@ public class VCFSubsetWalker extends RefWalker<ArrayList<VCFRecord>, VCFWriter>
} }
VCFRecord subset = new VCFRecord(record.getReferenceBase(), VCFRecord subset = new VCFRecord(record.getReferenceBase(),
record.getChromosome(), record.getLocation().getContig(),
(int) record.getPosition(), (int) record.getLocation().getStart(),
record.getID(), record.getID(),
record.getAlternateAlleles(), record.getAlternateAlleles(),
record.getQual(), record.getQual(),
@ -104,7 +106,7 @@ public class VCFSubsetWalker extends RefWalker<ArrayList<VCFRecord>, VCFWriter>
VCFRecord subset = subsetRecord(record); VCFRecord subset = subsetRecord(record);
boolean isVariant = false; boolean isVariant = false;
for (VCFGenotypeEncoding ge : subset.getVCFGenotypeRecords().get(0).getAlleles()) { for (VCFGenotypeEncoding ge : ((VCFGenotypeRecord)subset.getGenotypes().get(0)).getAlleles()) {
if (record.getReferenceBase() != ge.getBases().charAt(0)) { if (record.getReferenceBase() != ge.getBases().charAt(0)) {
isVariant = true; isVariant = true;
} }

View File

@ -509,8 +509,8 @@ public class VCFTool
public static Interval getIntervalFromRecord(VCFRecord record) public static Interval getIntervalFromRecord(VCFRecord record)
{ {
String chr = record.getChromosome(); String chr = record.getLocation().getContig();
long off = record.getPosition(); long off = record.getLocation().getStart();
return new Interval(chr, (int)off, (int)off); return new Interval(chr, (int)off, (int)off);
} }

View File

@ -5,7 +5,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
/** /**
* @author aaron * @author aaron
* <p/> * <p/>
* Class GenotypeLikelihood * Class Genotype
* <p/> * <p/>
* This class emcompasses all the basic information about a genotype * This class emcompasses all the basic information about a genotype
*/ */

View File

@ -1,5 +1,8 @@
package org.broadinstitute.sting.utils.genotype.vcf; package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
@ -11,10 +14,8 @@ import java.util.Map;
* <p/> * <p/>
* Class VCFGenotypeRecord * Class VCFGenotypeRecord
* <p/> * <p/>
* The genotype record in VCF store a considerable amount of information,
* so they were broken off into their own class
*/ */
public class VCFGenotypeRecord { public class VCFGenotypeRecord implements Genotype {
// the symbol for a empty genotype // the symbol for a empty genotype
public static final String EMPTY_GENOTYPE = "."; public static final String EMPTY_GENOTYPE = ".";
@ -23,6 +24,9 @@ public class VCFGenotypeRecord {
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
} }
// our record
private VCFRecord mRecord;
// our phasing // our phasing
private PHASE mPhaseType; private PHASE mPhaseType;
@ -38,10 +42,10 @@ public class VCFGenotypeRecord {
/** /**
* Create a VCF genotype record * Create a VCF genotype record
* *
* @param sampleName * @param sampleName sample name
* @param genotypes * @param genotypes list of genotypes
* @param phasing * @param phasing phasing
* @param otherFlags * @param otherFlags other flags
*/ */
public VCFGenotypeRecord(String sampleName, List<VCFGenotypeEncoding> genotypes, PHASE phasing, Map<String, String> otherFlags) { public VCFGenotypeRecord(String sampleName, List<VCFGenotypeEncoding> genotypes, PHASE phasing, Map<String, String> otherFlags) {
this.mSampleName = sampleName; this.mSampleName = sampleName;
@ -50,11 +54,16 @@ public class VCFGenotypeRecord {
if (otherFlags != null) this.mFields.putAll(otherFlags); if (otherFlags != null) this.mFields.putAll(otherFlags);
} }
public void setVCFRecord(VCFRecord record) {
this.mRecord = record;
}
/** /**
* determine the phase of the genotype * determine the phase of the genotype
* *
* @param phase the string that contains the phase character * @param phase the string that contains the phase character
*
* @return the phase
*/ */
static PHASE determinePhase(String phase) { static PHASE determinePhase(String phase) {
// find the phasing information // find the phasing information
@ -68,7 +77,6 @@ public class VCFGenotypeRecord {
throw new IllegalArgumentException("Unknown genotype phasing parameter"); throw new IllegalArgumentException("Unknown genotype phasing parameter");
} }
/** getter methods */
public PHASE getPhaseType() { public PHASE getPhaseType() {
return mPhaseType; return mPhaseType;
@ -86,6 +94,64 @@ public class VCFGenotypeRecord {
return mFields; return mFields;
} }
public double getNegLog10PError() {
return ( mFields.containsKey("GQ") ? Double.valueOf(mFields.get("GQ")) / 10.0 : 0.0);
}
public GenomeLoc getLocation() {
return mRecord != null ? mRecord.getLocation() : null;
}
public char getReference() {
return mRecord != null ? mRecord.getReferenceBase() : 'N';
}
public Variation toVariation(char ref) {
return mRecord != null ? mRecord : null;
}
public String getBases() {
String genotype = "";
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles )
genotype += encoding.getBases();
return genotype;
}
public boolean isVariant(char ref) {
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) {
if ( encoding.getType() == VCFGenotypeEncoding.TYPE.UNCALLED )
continue;
if ( encoding.getType() != VCFGenotypeEncoding.TYPE.SINGLE_BASE ||
encoding.getBases().charAt(0) != ref )
return true;
}
return false;
}
public boolean isPointGenotype() {
return (mRecord != null ? !mRecord.isIndel() : true);
}
public boolean isHom() {
if ( mGenotypeAlleles.size() == 0 )
return true;
String bases = mGenotypeAlleles.get(0).getBases();
for ( int i = 1; i < mGenotypeAlleles.size(); i++ ) {
if ( !bases.equals(mGenotypeAlleles.get(1).getBases()) )
return false;
}
return true;
}
public boolean isHet() {
return !isHom();
}
public int getPloidy() {
return 2;
}
private String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) { private String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) {
String str = ""; String str = "";
boolean first = true; boolean first = true;
@ -144,7 +210,6 @@ public class VCFGenotypeRecord {
public String toStringEncoding(List<VCFGenotypeEncoding> altAlleles) { public String toStringEncoding(List<VCFGenotypeEncoding> altAlleles) {
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
builder.append(toGenotypeString(altAlleles)); builder.append(toGenotypeString(altAlleles));
boolean first = true;
for (String field : mFields.keySet()) { for (String field : mFields.keySet()) {
if (mFields.get(field).equals("")) continue; if (mFields.get(field).equals("")) continue;
builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR); builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR);

View File

@ -1,14 +1,16 @@
package org.broadinstitute.sting.utils.genotype.vcf; package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.*; import java.util.*;
/** /**
* the basic VCF record type * the basic VCF record type
*/ */
public class VCFRecord { public class VCFRecord implements Variation, VariantBackedByGenotype {
// commonly used strings that are in the standard // commonly used strings that are in the standard
public static final String FORMAT_FIELD_SEPERATOR = ":"; public static final String FORMAT_FIELD_SEPERATOR = ":";
public static final String GENOTYPE_FIELD_SEPERATOR = ":"; public static final String GENOTYPE_FIELD_SEPERATOR = ":";
@ -17,12 +19,11 @@ public class VCFRecord {
public static final String INFO_FIELD_SEPERATOR = ";"; public static final String INFO_FIELD_SEPERATOR = ";";
public static final String EMPTY_INFO_FIELD = "."; public static final String EMPTY_INFO_FIELD = ".";
public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f"; public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
// the reference base // the reference base
private char mReferenceBase; private char mReferenceBase;
// our contig // our location
private String mChrome; private GenomeLoc mLoc;
// our position
private int mPosition;
// our id; set to '.' if not available // our id; set to '.' if not available
private String mID; private String mID;
// the alternate bases // the alternate bases
@ -37,7 +38,7 @@ public class VCFRecord {
private final String mGenotypeFormatString; private final String mGenotypeFormatString;
// our genotype sample fields // our genotype sample fields
private final List<VCFGenotypeRecord> mGenotypeFields = new ArrayList<VCFGenotypeRecord>(); private final List<Genotype> mGenotypeFields = new ArrayList<Genotype>();
/** /**
* given a VCF header, and the values for each of the columns, create a VCF record. * given a VCF header, and the values for each of the columns, create a VCF record.
@ -63,6 +64,7 @@ public class VCFRecord {
* @param qual the qual field * @param qual the qual field
* @param filters the filters used on this variant * @param filters the filters used on this variant
* @param infoFields the information fields * @param infoFields the information fields
* @param genotypeFormatString the format string
* @param genotypeObjects the genotype objects * @param genotypeObjects the genotype objects
*/ */
public VCFRecord(char referenceBase, public VCFRecord(char referenceBase,
@ -76,8 +78,7 @@ public class VCFRecord {
String genotypeFormatString, String genotypeFormatString,
List<VCFGenotypeRecord> genotypeObjects) { List<VCFGenotypeRecord> genotypeObjects) {
this.setReferenceBase(referenceBase); this.setReferenceBase(referenceBase);
this.mChrome = contig; this.setLocation(contig, position);
this.setPosition(position);
this.mID = ID; this.mID = ID;
for (VCFGenotypeEncoding alt : altBases) for (VCFGenotypeEncoding alt : altBases)
this.addAlternateBase(alt); this.addAlternateBase(alt);
@ -85,6 +86,10 @@ public class VCFRecord {
this.setFilterString(filters); this.setFilterString(filters);
this.mInfoFields.putAll(infoFields); this.mInfoFields.putAll(infoFields);
mGenotypeFormatString = genotypeFormatString; mGenotypeFormatString = genotypeFormatString;
// associate the genotypes with this Variation, then add them
for ( VCFGenotypeRecord rec : genotypeObjects )
rec.setVCFRecord(this);
this.mGenotypeFields.addAll(genotypeObjects); this.mGenotypeFields.addAll(genotypeObjects);
} }
@ -104,13 +109,16 @@ public class VCFRecord {
* @param columnValues a map of the header fields to values * @param columnValues a map of the header fields to values
*/ */
private void extractFields(Map<VCFHeader.HEADER_FIELDS, String> columnValues) { private void extractFields(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
String chrom = null;
int position = -1;
for (VCFHeader.HEADER_FIELDS val : columnValues.keySet()) { for (VCFHeader.HEADER_FIELDS val : columnValues.keySet()) {
switch (val) { switch (val) {
case CHROM: case CHROM:
this.setChomosome(columnValues.get(val)); chrom = columnValues.get(val);
break; break;
case POS: case POS:
this.setPosition(Integer.valueOf(columnValues.get(val))); position = Integer.valueOf(columnValues.get(val));
break; break;
case ID: case ID:
this.setID(columnValues.get(val)); this.setID(columnValues.get(val));
@ -146,6 +154,7 @@ public class VCFRecord {
break; break;
} }
} }
setLocation(chrom, position);
} }
/** /**
@ -159,18 +168,10 @@ public class VCFRecord {
} }
/** /**
* @return the string for the chromosome that this VCF record is associated with * @return this VCF record's location
*/ */
public String getChromosome() { public GenomeLoc getLocation() {
return this.mChrome; return this.mLoc;
}
/**
* @return this VCF records position on the specified chromosome
*/
public long getPosition() {
return this.mPosition;
} }
/** /**
@ -180,13 +181,22 @@ public class VCFRecord {
return this.mID; return this.mID;
} }
/**
* get the reference base
*
* @return either A, T, C, G, or N
*/
public String getReference() {
return Character.toString(mReferenceBase);
}
/** /**
* get the reference base * get the reference base
* *
* @return either A, T, C, G, or N * @return either A, T, C, G, or N
*/ */
public char getReferenceBase() { public char getReferenceBase() {
return this.mReferenceBase; return mReferenceBase;
} }
/** /**
@ -194,6 +204,13 @@ public class VCFRecord {
* *
* @return an array of strings representing the alt alleles, or null if there are none * @return an array of strings representing the alt alleles, or null if there are none
*/ */
public List<String> getAlternateAlleleList() {
ArrayList<String> alts = new ArrayList<String>();
for ( VCFGenotypeEncoding alt : mAlts )
alts.add(alt.getBases());
return alts;
}
public List<VCFGenotypeEncoding> getAlternateAlleles() { public List<VCFGenotypeEncoding> getAlternateAlleles() {
return this.mAlts; return this.mAlts;
} }
@ -202,6 +219,83 @@ public class VCFRecord {
return getAlternateAlleles().size() > 0; return getAlternateAlleles().size() > 0;
} }
public boolean isBiallelic() {
return getAlternateAlleles().size() == 1;
}
public boolean isReference() {
return !hasAlternateAllele();
}
public List<String> getAlleleList() {
ArrayList<String> list = new ArrayList<String>();
list.add(getReference());
list.addAll(getAlternateAlleleList());
return list;
}
public double getNonRefAlleleFrequency() {
if ( mInfoFields.containsKey("AF") ) {
return Double.valueOf(mInfoFields.get("AF"));
} else {
// this is the poor man's AF
if ( mInfoFields.containsKey("AC") && mInfoFields.containsKey("AN")) {
String splt[] = mInfoFields.get("AC").split(",");
if ( splt.length > 0 ) {
return (Double.valueOf(splt[0]) / Double.valueOf(mInfoFields.get("AN")));
}
}
}
return 0.0;
}
public VARIANT_TYPE getType() {
if ( !hasAlternateAllele() )
return VARIANT_TYPE.REFERENCE;
// TODO -- figure out what to do about records with more than one type
VCFGenotypeEncoding encoding = mAlts.get(0);
switch ( encoding.getType() ) {
case SINGLE_BASE:
return VARIANT_TYPE.SNP;
case DELETION:
return VARIANT_TYPE.DELETION;
case INSERTION:
return VARIANT_TYPE.INSERTION;
}
throw new IllegalStateException("The record contains unknown genotype encodings");
}
public boolean isDeletion() {
return getType() == VARIANT_TYPE.DELETION;
}
public boolean isInsertion() {
return getType() == VARIANT_TYPE.INSERTION;
}
public boolean isIndel() {
return isDeletion() || isInsertion();
}
public boolean isSNP() {
return getType() == VARIANT_TYPE.SNP;
}
public char getAlternativeBaseForSNP() {
if ( !isSNP() && !isBiallelic() )
throw new IllegalStateException("This record does not represent a SNP");
return mAlts.get(0).getBases().charAt(0);
}
public char getReferenceForSNP() {
if ( !isSNP() )
throw new IllegalStateException("This record does not represent a SNP");
return getReferenceBase();
}
/** /**
* @return the phred-scaled quality score * @return the phred-scaled quality score
*/ */
@ -209,6 +303,13 @@ public class VCFRecord {
return this.mQual; return this.mQual;
} }
/**
* @return the -log10PError
*/
public double getNegLog10PError() {
return this.mQual / 10.0;
}
/** /**
* get the filter criteria * get the filter criteria
* *
@ -257,13 +358,34 @@ public class VCFRecord {
return VCFHeader.HEADER_FIELDS.values().length; return VCFHeader.HEADER_FIELDS.values().length;
} }
/**
* return the mapping of the format tags to the specified sample's values
*
* @return a VCFGenotypeRecord
*/
public List<VCFGenotypeRecord> getVCFGenotypeRecords() { public List<VCFGenotypeRecord> getVCFGenotypeRecords() {
return this.mGenotypeFields; ArrayList<VCFGenotypeRecord> list = new ArrayList<VCFGenotypeRecord>();
for ( Genotype g : mGenotypeFields )
list.add((VCFGenotypeRecord)g);
return list;
}
public List<Genotype> getGenotypes() {
return mGenotypeFields;
}
public Genotype getCalledGenotype() {
if ( mGenotypeFields == null || mGenotypeFields.size() != 1 )
throw new IllegalStateException("There is not one and only one genotype associated with this record");
VCFGenotypeRecord record = (VCFGenotypeRecord)mGenotypeFields.get(0);
if ( record.isEmptyGenotype() )
return null;
return record;
}
public boolean hasGenotype(DiploidGenotype x) {
if ( mGenotypeFields == null )
return false;
for ( Genotype g : mGenotypeFields ) {
if ( DiploidGenotype.valueOf(g.getBases()).equals(x) )
return true;
}
return false;
} }
/** /**
@ -271,9 +393,10 @@ public class VCFRecord {
*/ */
public String[] getSampleNames() { public String[] getSampleNames() {
String names[] = new String[mGenotypeFields.size()]; String names[] = new String[mGenotypeFields.size()];
int index = 0; for (int i = 0; i < mGenotypeFields.size(); i++) {
for (VCFGenotypeRecord rec : this.mGenotypeFields) VCFGenotypeRecord rec = (VCFGenotypeRecord)mGenotypeFields.get(i);
names[index++] = rec.getSampleName(); names[i] = rec.getSampleName();
}
return names; return names;
} }
@ -289,14 +412,12 @@ public class VCFRecord {
this.mReferenceBase = referenceBase; this.mReferenceBase = referenceBase;
} }
public void setPosition(int mPosition) { public void setLocation(String chrom, int position) {
if (mPosition < 0) if ( chrom == null )
throw new IllegalArgumentException("Chromosomes cannot be missing");
if ( position < 0 )
throw new IllegalArgumentException("Position values must be greater than 0"); throw new IllegalArgumentException("Position values must be greater than 0");
this.mPosition = mPosition; this.mLoc = GenomeLocParser.createGenomeLoc(chrom, position);
}
public void setChomosome(String mChrome) {
this.mChrome = mChrome;
} }
public void setID(String mID) { public void setID(String mID) {
@ -369,9 +490,9 @@ public class VCFRecord {
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
// CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO // CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO
builder.append(getChromosome()); builder.append(mLoc.getContig());
builder.append(FIELD_SEPERATOR); builder.append(FIELD_SEPERATOR);
builder.append(getPosition()); builder.append(mLoc.getStart());
builder.append(FIELD_SEPERATOR); builder.append(FIELD_SEPERATOR);
builder.append(getID()); builder.append(getID());
builder.append(FIELD_SEPERATOR); builder.append(FIELD_SEPERATOR);
@ -416,10 +537,10 @@ public class VCFRecord {
*/ */
private void addGenotypeData(StringBuilder builder, VCFHeader header) { private void addGenotypeData(StringBuilder builder, VCFHeader header) {
builder.append(FIELD_SEPERATOR + this.getGenotypeFormatString()); builder.append(FIELD_SEPERATOR + this.getGenotypeFormatString());
if (header.getGenotypeSamples().size() < getVCFGenotypeRecords().size()) if (header.getGenotypeSamples().size() < getGenotypes().size())
throw new RuntimeException("We have more genotype samples than the header specified"); throw new RuntimeException("We have more genotype samples than the header specified");
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getVCFGenotypeRecords()); Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getGenotypes());
for (String genotype : header.getGenotypeSamples()) { for (String genotype : header.getGenotypeSamples()) {
builder.append(FIELD_SEPERATOR); builder.append(FIELD_SEPERATOR);
@ -447,8 +568,7 @@ public class VCFRecord {
public boolean equals(VCFRecord other) { public boolean equals(VCFRecord other) {
if (!this.mAlts.equals(other.mAlts)) return false; if (!this.mAlts.equals(other.mAlts)) return false;
if (this.mReferenceBase != other.mReferenceBase) return false; if (this.mReferenceBase != other.mReferenceBase) return false;
if (!this.mChrome.equals(other.mChrome)) return false; if (!this.mLoc.equals(other.mLoc)) return false;
if (this.mPosition != other.mPosition) return false;
if (!this.mID.equals(other.mID)) return false; if (!this.mID.equals(other.mID)) return false;
if (this.mQual != other.mQual) return false; if (this.mQual != other.mQual) return false;
if (!this.mFilterString.equals(other.mFilterString)) return false; if (!this.mFilterString.equals(other.mFilterString)) return false;
@ -463,9 +583,10 @@ public class VCFRecord {
* @param list a list of genotype samples * @param list a list of genotype samples
* @return a mapping of the sample name to VCF genotype record * @return a mapping of the sample name to VCF genotype record
*/ */
private static Map<String, VCFGenotypeRecord> genotypeListToMap(List<VCFGenotypeRecord> list) { private static Map<String, VCFGenotypeRecord> genotypeListToMap(List<Genotype> list) {
Map<String, VCFGenotypeRecord> map = new HashMap<String, VCFGenotypeRecord>(); Map<String, VCFGenotypeRecord> map = new HashMap<String, VCFGenotypeRecord>();
for (VCFGenotypeRecord rec : list) { for (int i = 0; i < list.size(); i++) {
VCFGenotypeRecord rec = (VCFGenotypeRecord)list.get(i);
map.put(rec.getSampleName(), rec); map.put(rec.getSampleName(), rec);
} }
return map; return map;

View File

@ -1,9 +1,13 @@
package org.broadinstitute.sting.utils.genotype.vcf; package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.gatk.refdata.RodVCF; import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import org.junit.BeforeClass;
import java.io.*; import java.io.*;
import java.util.ArrayList; import java.util.ArrayList;
@ -18,6 +22,18 @@ public class VCFReaderTest extends BaseTest {
private static final File multiSampleVCF = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/MultiSample.vcf"); private static final File multiSampleVCF = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/MultiSample.vcf");
private static final String VCF_MIXUP_FILE = "/humgen/gsa-scr1/GATK_Data/Validation_Data/mixedup.vcf"; private static final String VCF_MIXUP_FILE = "/humgen/gsa-scr1/GATK_Data/Validation_Data/mixedup.vcf";
private static IndexedFastaSequenceFile seq;
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File("/broad/1KG/reference/human_b36_both.fasta"));
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
}
@Test @Test
public void testVCFInput() { public void testVCFInput() {
VCFReader reader = new VCFReader(vcfFile); VCFReader reader = new VCFReader(vcfFile);

View File

@ -1,10 +1,16 @@
package org.broadinstitute.sting.utils.genotype.vcf; package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import org.junit.BeforeClass;
import java.util.*; import java.util.*;
import java.io.File;
import java.io.FileNotFoundException;
/** /**
@ -16,6 +22,18 @@ import java.util.*;
*/ */
public class VCFRecordTest extends BaseTest { public class VCFRecordTest extends BaseTest {
private static IndexedFastaSequenceFile seq;
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
}
/** /**
* create a fake VCF record * create a fake VCF record
* *

View File

@ -1,10 +1,15 @@
package org.broadinstitute.sting.utils.genotype.vcf; package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import org.junit.BeforeClass;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException;
import java.util.*; import java.util.*;
@ -21,6 +26,18 @@ public class VCFWriterTest extends BaseTest {
private Set<String> additionalColumns = new HashSet<String>(); private Set<String> additionalColumns = new HashSet<String>();
private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf"); private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf");
private static IndexedFastaSequenceFile seq;
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
}
/** test, using the writer and reader, that we can output and input a VCF file without problems */ /** test, using the writer and reader, that we can output and input a VCF file without problems */
@Test @Test
public void testBasicWriteAndRead() { public void testBasicWriteAndRead() {