Stage 2 of Variation refactoring:

VCFRecord now implements Variation, VCFGenotypeRecord now implements Genotype.

Because of this change, RodVCF is now just a wrapper around the VCFRecord and does nothing else.  Also, one can call toVariation on the VCFGenotypeRecord and it returns the VCFRecord.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2271 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-12-06 06:48:03 +00:00
parent 3b440e0dbc
commit b6f8e33f4c
9 changed files with 358 additions and 235 deletions

View File

@ -1,10 +1,7 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
@ -13,7 +10,6 @@ import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@ -41,27 +37,30 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
}
public void assertNotNull() {
if (mCurrentRecord == null) {
if ( mCurrentRecord == null ) {
throw new UnsupportedOperationException("The current Record is null");
}
}
public void assertBiAllelic() {
if ( !isBiallelic() )
throw new StingException("This VCF rod is not bi-allelic.");
}
@Override
public boolean parseLine(Object header, String[] parts) throws IOException {
throw new UnsupportedOperationException("We don't support the parse line");
throw new UnsupportedOperationException("RodVCF does not support the parseLine method");
}
public Object initialize(final File source) throws FileNotFoundException {
if (mReader == null) mReader = new VCFReader(source);
if ( mReader == null )
mReader = new VCFReader(source);
return mReader.getHeader();
}
@Override
public String toString() {
if (this.mCurrentRecord != null)
return this.mCurrentRecord.toStringEncoding(mReader.getHeader());
else
return "";
return (mCurrentRecord != null ? mCurrentRecord.toStringEncoding(mReader.getHeader()) : "");
}
public static RodVCF createIterator(String name, File file) {
@ -74,13 +73,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
return vcf;
}
public void assertBiAllelic() {
if (!this.isBiallelic()) throw new StingException("We're not bi-allelic.");
}
public boolean hasNonRefAlleleFrequency() {
return (this.mCurrentRecord.getInfoValues().containsKey("AF") ||
(this.mCurrentRecord.getInfoValues().containsKey("AC") && this.mCurrentRecord.getInfoValues().containsKey("AN")));
assertNotNull();
return mCurrentRecord.getNonRefAlleleFrequency() > 0.0;
}
/**
@ -88,25 +83,13 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return VariantFrequency with the stored frequency
*/
@Override
public double getNonRefAlleleFrequency() {
assertNotNull();
if (this.mCurrentRecord.getInfoValues().containsKey("AF")) {
return Double.valueOf(this.mCurrentRecord.getInfoValues().get("AF"));
} else {
// this is the poor man's AF
if (this.mCurrentRecord.getInfoValues().containsKey("AC") && this.mCurrentRecord.getInfoValues().containsKey("AN")) {
String splt[] = mCurrentRecord.getInfoValues().get("AC").split(",");
if (splt.length > 0) {
return (Double.valueOf(splt[0]) / Double.valueOf(mCurrentRecord.getInfoValues().get("AN")));
}
}
}
return 0.0;
return mCurrentRecord.getNonRefAlleleFrequency();
}
public boolean hasStrandBias() {
assertNotNull();
return this.mCurrentRecord.getInfoValues().containsKey("SB");
}
@ -116,19 +99,13 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
* @return StrandBias with the stored slod
*/
public double getStrandBias() {
assertNotNull();
if (this.mCurrentRecord.getInfoValues().containsKey("SB"))
return Double.valueOf(this.mCurrentRecord.getInfoValues().get("SB"));
return 0.0;
return hasStrandBias() ? Double.valueOf(this.mCurrentRecord.getInfoValues().get("SB")) : 0.0;
}
/** @return the VARIANT_TYPE of the current variant */
@Override
public VARIANT_TYPE getType() {
if (this.isSNP()) return VARIANT_TYPE.SNP;
else if (this.isInsertion()) return VARIANT_TYPE.INSERTION;
else if (this.isDeletion()) return VARIANT_TYPE.DELETION;
return VARIANT_TYPE.REFERENCE;
assertNotNull();
return mCurrentRecord.getType();
}
/**
@ -136,15 +113,10 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return true if we're a SNP
*/
@Override
public boolean isSNP() {
this.assertNotNull();
assertNotNull();
assertBiAllelic();
for (VCFGenotypeEncoding alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.getType() != VCFGenotypeEncoding.TYPE.SINGLE_BASE)
return false;
}
return true;
return mCurrentRecord.isSNP();
}
/**
@ -152,17 +124,10 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return true if we are, false otherwise
*/
@Override
public boolean isInsertion() {
this.assertNotNull();
assertNotNull();
assertBiAllelic();
if (!mCurrentRecord.hasAlternateAllele())
return false;
for (VCFGenotypeEncoding alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.getType() == VCFGenotypeEncoding.TYPE.INSERTION)
return true;
}
return false;
return mCurrentRecord.isInsertion();
}
/**
@ -170,23 +135,16 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return true if we are, false otherwise
*/
@Override
public boolean isDeletion() {
this.assertNotNull();
assertNotNull();
assertBiAllelic();
if (!mCurrentRecord.hasAlternateAllele())
return false;
for (VCFGenotypeEncoding alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.getType() == VCFGenotypeEncoding.TYPE.DELETION)
return true;
}
return false;
return mCurrentRecord.isDeletion();
}
@Override
public GenomeLoc getLocation() {
this.assertNotNull();
return GenomeLocParser.createGenomeLoc(mCurrentRecord.getChromosome(), mCurrentRecord.getPosition(), mCurrentRecord.getPosition());
assertNotNull();
return mCurrentRecord.getLocation();
}
/**
@ -194,15 +152,15 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return the reference base or bases, as a string
*/
@Override
public String getReference() {
return String.valueOf(mCurrentRecord.getReferenceBase());
assertNotNull();
return mCurrentRecord.getReference();
}
/** are we bi-allelic? */
@Override
public boolean isBiallelic() {
return (this.getAlternateAlleleList().size() == 1);
assertNotNull();
return mCurrentRecord.isBiallelic();
}
/**
@ -210,10 +168,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return the log based error estimate
*/
@Override
public double getNegLog10PError() {
// we're -10 log(error), we have to divide by 10
return mCurrentRecord.getQual() / 10.0;
assertNotNull();
return mCurrentRecord.getNegLog10PError();
}
/**
@ -224,12 +181,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return an alternate allele list
*/
@Override
public List<String> getAlternateAlleleList() {
List<String> list = new ArrayList<String>();
for (VCFGenotypeEncoding enc : mCurrentRecord.getAlternateAlleles())
list.add(enc.toString());
return list;
assertNotNull();
return mCurrentRecord.getAlternateAlleleList();
}
/**
@ -239,12 +193,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return an alternate allele list
*/
@Override
public List<String> getAlleleList() {
List<String> ret = new ArrayList<String>();
ret.add(String.valueOf(mCurrentRecord.getReferenceBase()));
ret.addAll(getAlternateAlleleList());
return ret;
assertNotNull();
return mCurrentRecord.getAlleleList();
}
/**
@ -252,9 +203,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return false if we're a variant(indel, delete, SNP, etc), true if we're not
*/
@Override
public boolean isReference() {
return (!mCurrentRecord.hasAlternateAllele());
assertNotNull();
return mCurrentRecord.isReference();
}
/**
@ -262,9 +213,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return true if we're an insertion or deletion
*/
@Override
public boolean isIndel() {
return (!isSNP());
assertNotNull();
return mCurrentRecord.isIndel();
}
/**
@ -273,11 +224,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return a char, representing the alternate base
*/
@Override
public char getAlternativeBaseForSNP() {
if (!isSNP()) throw new IllegalStateException("we're not a SNP");
if (mCurrentRecord.getAlternateAlleles().size() != 1) throw new UnsupportedOperationException("We're not a biallelic VCF site");
return (mCurrentRecord.getAlternateAlleles().get(0).toString()).charAt(0);
assertNotNull();
return mCurrentRecord.getAlternativeBaseForSNP();
}
/**
@ -285,10 +234,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return a char, representing the alternate base
*/
@Override
public char getReferenceForSNP() {
if (!isSNP()) throw new IllegalStateException("we're not a SNP");
return mCurrentRecord.getReferenceBase();
assertNotNull();
return mCurrentRecord.getReferenceForSNP();
}
/**
@ -296,28 +244,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return a map in lexigraphical order of the genotypes
*/
@Override
public Genotype getCalledGenotype() {
double refQual = (this.getNegLog10PError());
if (this.mCurrentRecord != null && this.mCurrentRecord.hasGenotypeData()) {
List<VCFGenotypeRecord> lst = this.mCurrentRecord.getVCFGenotypeRecords();
if (lst.size() != 1) {
throw new IllegalStateException("VCF object does not have one and only one genotype record");
}
VCFGenotypeRecord rec = lst.get(0);
if ( rec.isEmptyGenotype() )
return null;
double qual = 0;
if (rec.getAlleles().equals(this.getReference()))
qual = refQual;
else if (rec.getFields().containsKey("GQ"))
qual = Double.valueOf(rec.getFields().get("GQ")) / 10.0;
int coverage = (rec.getFields().containsKey("RD") ? Integer.valueOf(rec.getFields().get("RD")) : 0);
return new VCFGenotypeCall(this.getReference().charAt(0), this.getLocation(), Utils.join("", rec.getAlleles()), qual, coverage, rec.getSampleName());
}
return null;
assertNotNull();
return mCurrentRecord.getCalledGenotype();
}
/**
@ -325,53 +254,9 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return a list of the genotypes
*/
@Override
public List<Genotype> getGenotypes() {
List<Genotype> genotypes = new ArrayList<Genotype>();
if (!this.mCurrentRecord.hasGenotypeData()) {
return genotypes;
}
double refQual = (this.getNegLog10PError());
// add the reference
for (VCFGenotypeRecord rec : mCurrentRecord.getVCFGenotypeRecords()) {
if ( rec.isEmptyGenotype() )
continue;
double qual = 0;
if (rec.getAlleles().equals(this.getReference()))
qual = refQual;
else if (rec.getFields().containsKey("GQ"))
qual = Double.valueOf(rec.getFields().get("GQ")) / 10.0;
int coverage = (rec.getFields().containsKey("RD") ? Integer.valueOf(rec.getFields().get("RD")) : 0);
genotypes.add(new VCFGenotypeCall(this.getReference().charAt(0), this.getLocation(), Utils.join("", rec.getAlleles()), qual, coverage, rec.getSampleName()));
}
return genotypes;
}
/**
* a private helper method
*
* @return an array in lexigraphical order of the likelihoods
*/
private Genotype getGenotype(DiploidGenotype x) {
if (x.toString().equals(getReference()))
return new BasicGenotype(this.getLocation(), getReference(), this.getReference().charAt(0), 0);
for (VCFGenotypeRecord record : mCurrentRecord.getVCFGenotypeRecords()) {
if (Utils.join("", record.getAlleles()).equals(x.toString())) {
double qual = 0.0;
if (record.getAlleles().equals(this.getReference()))
qual = this.getNegLog10PError();
else if (record.getFields().containsKey("GQ"))
qual = Double.valueOf(record.getFields().get("GQ")) / 10.0;
int coverage = (record.getFields().containsKey("RD") ? Integer.valueOf(record.getFields().get("RD")) : 0);
return new VCFGenotypeCall(this.getReference().charAt(0), this.getLocation(), Utils.join("", record.getAlleles()), qual, coverage, record.getSampleName());
}
}
return null;
}
public VCFHeader getHeader() {
return mReader.getHeader();
assertNotNull();
return mCurrentRecord.getGenotypes();
}
/**
@ -382,26 +267,25 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*
* @return true if available, false otherwise
*/
@Override
public boolean hasGenotype(DiploidGenotype x) {
if (getGenotype(x) != null)
return true;
return false;
assertNotNull();
return mCurrentRecord.hasGenotype(x);
}
public VCFHeader getHeader() {
return mReader.getHeader();
}
@Override
public boolean hasNext() {
return (mReader.hasNext());
return mReader.hasNext();
}
@Override
public RodVCF next() {
mCurrentRecord = mReader.next();
return new RodVCF(this.name, mCurrentRecord, mReader);
return new RodVCF(name, mCurrentRecord, mReader);
}
@Override
public void remove() {
throw new UnsupportedOperationException("You cannot remove from a VCF rod");
throw new UnsupportedOperationException("The remove operation is not supported for a VCF rod");
}
}

View File

@ -77,7 +77,9 @@ public class VCFSubsetWalker extends RefWalker<ArrayList<VCFRecord>, VCFWriter>
private VCFRecord subsetRecord(VCFRecord record) {
ArrayList<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
for (VCFGenotypeRecord gr : record.getVCFGenotypeRecords()) {
for (int i = 0; i < record.getGenotypes().size(); i++) {
VCFGenotypeRecord gr = (VCFGenotypeRecord)record.getGenotypes().get(i);
//if (gr.getSampleName().equalsIgnoreCase(SAMPLE)) {
if (SAMPLES.contains(gr.getSampleName())) {
//out.println(gr.getSampleName() + " " + gr.toGenotypeString(record.getAlternateAlleles()));
@ -86,8 +88,8 @@ public class VCFSubsetWalker extends RefWalker<ArrayList<VCFRecord>, VCFWriter>
}
VCFRecord subset = new VCFRecord(record.getReferenceBase(),
record.getChromosome(),
(int) record.getPosition(),
record.getLocation().getContig(),
(int) record.getLocation().getStart(),
record.getID(),
record.getAlternateAlleles(),
record.getQual(),
@ -104,7 +106,7 @@ public class VCFSubsetWalker extends RefWalker<ArrayList<VCFRecord>, VCFWriter>
VCFRecord subset = subsetRecord(record);
boolean isVariant = false;
for (VCFGenotypeEncoding ge : subset.getVCFGenotypeRecords().get(0).getAlleles()) {
for (VCFGenotypeEncoding ge : ((VCFGenotypeRecord)subset.getGenotypes().get(0)).getAlleles()) {
if (record.getReferenceBase() != ge.getBases().charAt(0)) {
isVariant = true;
}

View File

@ -509,8 +509,8 @@ public class VCFTool
public static Interval getIntervalFromRecord(VCFRecord record)
{
String chr = record.getChromosome();
long off = record.getPosition();
String chr = record.getLocation().getContig();
long off = record.getLocation().getStart();
return new Interval(chr, (int)off, (int)off);
}

View File

@ -5,7 +5,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
/**
* @author aaron
* <p/>
* Class GenotypeLikelihood
* Class Genotype
* <p/>
* This class emcompasses all the basic information about a genotype
*/

View File

@ -1,5 +1,8 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@ -11,10 +14,8 @@ import java.util.Map;
* <p/>
* Class VCFGenotypeRecord
* <p/>
* The genotype record in VCF store a considerable amount of information,
* so they were broken off into their own class
*/
public class VCFGenotypeRecord {
public class VCFGenotypeRecord implements Genotype {
// the symbol for a empty genotype
public static final String EMPTY_GENOTYPE = ".";
@ -23,6 +24,9 @@ public class VCFGenotypeRecord {
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
}
// our record
private VCFRecord mRecord;
// our phasing
private PHASE mPhaseType;
@ -38,10 +42,10 @@ public class VCFGenotypeRecord {
/**
* Create a VCF genotype record
*
* @param sampleName
* @param genotypes
* @param phasing
* @param otherFlags
* @param sampleName sample name
* @param genotypes list of genotypes
* @param phasing phasing
* @param otherFlags other flags
*/
public VCFGenotypeRecord(String sampleName, List<VCFGenotypeEncoding> genotypes, PHASE phasing, Map<String, String> otherFlags) {
this.mSampleName = sampleName;
@ -50,11 +54,16 @@ public class VCFGenotypeRecord {
if (otherFlags != null) this.mFields.putAll(otherFlags);
}
public void setVCFRecord(VCFRecord record) {
this.mRecord = record;
}
/**
* determine the phase of the genotype
*
* @param phase the string that contains the phase character
*
* @return the phase
*/
static PHASE determinePhase(String phase) {
// find the phasing information
@ -68,7 +77,6 @@ public class VCFGenotypeRecord {
throw new IllegalArgumentException("Unknown genotype phasing parameter");
}
/** getter methods */
public PHASE getPhaseType() {
return mPhaseType;
@ -86,6 +94,64 @@ public class VCFGenotypeRecord {
return mFields;
}
public double getNegLog10PError() {
return ( mFields.containsKey("GQ") ? Double.valueOf(mFields.get("GQ")) / 10.0 : 0.0);
}
public GenomeLoc getLocation() {
return mRecord != null ? mRecord.getLocation() : null;
}
public char getReference() {
return mRecord != null ? mRecord.getReferenceBase() : 'N';
}
public Variation toVariation(char ref) {
return mRecord != null ? mRecord : null;
}
public String getBases() {
String genotype = "";
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles )
genotype += encoding.getBases();
return genotype;
}
public boolean isVariant(char ref) {
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) {
if ( encoding.getType() == VCFGenotypeEncoding.TYPE.UNCALLED )
continue;
if ( encoding.getType() != VCFGenotypeEncoding.TYPE.SINGLE_BASE ||
encoding.getBases().charAt(0) != ref )
return true;
}
return false;
}
public boolean isPointGenotype() {
return (mRecord != null ? !mRecord.isIndel() : true);
}
public boolean isHom() {
if ( mGenotypeAlleles.size() == 0 )
return true;
String bases = mGenotypeAlleles.get(0).getBases();
for ( int i = 1; i < mGenotypeAlleles.size(); i++ ) {
if ( !bases.equals(mGenotypeAlleles.get(1).getBases()) )
return false;
}
return true;
}
public boolean isHet() {
return !isHom();
}
public int getPloidy() {
return 2;
}
private String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) {
String str = "";
boolean first = true;
@ -144,7 +210,6 @@ public class VCFGenotypeRecord {
public String toStringEncoding(List<VCFGenotypeEncoding> altAlleles) {
StringBuilder builder = new StringBuilder();
builder.append(toGenotypeString(altAlleles));
boolean first = true;
for (String field : mFields.keySet()) {
if (mFields.get(field).equals("")) continue;
builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR);

View File

@ -1,14 +1,16 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.*;
/**
* the basic VCF record type
*/
public class VCFRecord {
public class VCFRecord implements Variation, VariantBackedByGenotype {
// commonly used strings that are in the standard
public static final String FORMAT_FIELD_SEPERATOR = ":";
public static final String GENOTYPE_FIELD_SEPERATOR = ":";
@ -17,12 +19,11 @@ public class VCFRecord {
public static final String INFO_FIELD_SEPERATOR = ";";
public static final String EMPTY_INFO_FIELD = ".";
public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
// the reference base
private char mReferenceBase;
// our contig
private String mChrome;
// our position
private int mPosition;
// our location
private GenomeLoc mLoc;
// our id; set to '.' if not available
private String mID;
// the alternate bases
@ -37,7 +38,7 @@ public class VCFRecord {
private final String mGenotypeFormatString;
// our genotype sample fields
private final List<VCFGenotypeRecord> mGenotypeFields = new ArrayList<VCFGenotypeRecord>();
private final List<Genotype> mGenotypeFields = new ArrayList<Genotype>();
/**
* given a VCF header, and the values for each of the columns, create a VCF record.
@ -63,6 +64,7 @@ public class VCFRecord {
* @param qual the qual field
* @param filters the filters used on this variant
* @param infoFields the information fields
* @param genotypeFormatString the format string
* @param genotypeObjects the genotype objects
*/
public VCFRecord(char referenceBase,
@ -76,8 +78,7 @@ public class VCFRecord {
String genotypeFormatString,
List<VCFGenotypeRecord> genotypeObjects) {
this.setReferenceBase(referenceBase);
this.mChrome = contig;
this.setPosition(position);
this.setLocation(contig, position);
this.mID = ID;
for (VCFGenotypeEncoding alt : altBases)
this.addAlternateBase(alt);
@ -85,6 +86,10 @@ public class VCFRecord {
this.setFilterString(filters);
this.mInfoFields.putAll(infoFields);
mGenotypeFormatString = genotypeFormatString;
// associate the genotypes with this Variation, then add them
for ( VCFGenotypeRecord rec : genotypeObjects )
rec.setVCFRecord(this);
this.mGenotypeFields.addAll(genotypeObjects);
}
@ -104,13 +109,16 @@ public class VCFRecord {
* @param columnValues a map of the header fields to values
*/
private void extractFields(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
String chrom = null;
int position = -1;
for (VCFHeader.HEADER_FIELDS val : columnValues.keySet()) {
switch (val) {
case CHROM:
this.setChomosome(columnValues.get(val));
chrom = columnValues.get(val);
break;
case POS:
this.setPosition(Integer.valueOf(columnValues.get(val)));
position = Integer.valueOf(columnValues.get(val));
break;
case ID:
this.setID(columnValues.get(val));
@ -146,6 +154,7 @@ public class VCFRecord {
break;
}
}
setLocation(chrom, position);
}
/**
@ -159,18 +168,10 @@ public class VCFRecord {
}
/**
* @return the string for the chromosome that this VCF record is associated with
* @return this VCF record's location
*/
public String getChromosome() {
return this.mChrome;
}
/**
* @return this VCF records position on the specified chromosome
*/
public long getPosition() {
return this.mPosition;
public GenomeLoc getLocation() {
return this.mLoc;
}
/**
@ -180,13 +181,22 @@ public class VCFRecord {
return this.mID;
}
/**
* get the reference base
*
* @return either A, T, C, G, or N
*/
public String getReference() {
return Character.toString(mReferenceBase);
}
/**
* get the reference base
*
* @return either A, T, C, G, or N
*/
public char getReferenceBase() {
return this.mReferenceBase;
return mReferenceBase;
}
/**
@ -194,6 +204,13 @@ public class VCFRecord {
*
* @return an array of strings representing the alt alleles, or null if there are none
*/
public List<String> getAlternateAlleleList() {
ArrayList<String> alts = new ArrayList<String>();
for ( VCFGenotypeEncoding alt : mAlts )
alts.add(alt.getBases());
return alts;
}
public List<VCFGenotypeEncoding> getAlternateAlleles() {
return this.mAlts;
}
@ -202,6 +219,83 @@ public class VCFRecord {
return getAlternateAlleles().size() > 0;
}
public boolean isBiallelic() {
return getAlternateAlleles().size() == 1;
}
public boolean isReference() {
return !hasAlternateAllele();
}
public List<String> getAlleleList() {
ArrayList<String> list = new ArrayList<String>();
list.add(getReference());
list.addAll(getAlternateAlleleList());
return list;
}
public double getNonRefAlleleFrequency() {
if ( mInfoFields.containsKey("AF") ) {
return Double.valueOf(mInfoFields.get("AF"));
} else {
// this is the poor man's AF
if ( mInfoFields.containsKey("AC") && mInfoFields.containsKey("AN")) {
String splt[] = mInfoFields.get("AC").split(",");
if ( splt.length > 0 ) {
return (Double.valueOf(splt[0]) / Double.valueOf(mInfoFields.get("AN")));
}
}
}
return 0.0;
}
public VARIANT_TYPE getType() {
if ( !hasAlternateAllele() )
return VARIANT_TYPE.REFERENCE;
// TODO -- figure out what to do about records with more than one type
VCFGenotypeEncoding encoding = mAlts.get(0);
switch ( encoding.getType() ) {
case SINGLE_BASE:
return VARIANT_TYPE.SNP;
case DELETION:
return VARIANT_TYPE.DELETION;
case INSERTION:
return VARIANT_TYPE.INSERTION;
}
throw new IllegalStateException("The record contains unknown genotype encodings");
}
public boolean isDeletion() {
return getType() == VARIANT_TYPE.DELETION;
}
public boolean isInsertion() {
return getType() == VARIANT_TYPE.INSERTION;
}
public boolean isIndel() {
return isDeletion() || isInsertion();
}
public boolean isSNP() {
return getType() == VARIANT_TYPE.SNP;
}
public char getAlternativeBaseForSNP() {
if ( !isSNP() && !isBiallelic() )
throw new IllegalStateException("This record does not represent a SNP");
return mAlts.get(0).getBases().charAt(0);
}
public char getReferenceForSNP() {
if ( !isSNP() )
throw new IllegalStateException("This record does not represent a SNP");
return getReferenceBase();
}
/**
* @return the phred-scaled quality score
*/
@ -209,6 +303,13 @@ public class VCFRecord {
return this.mQual;
}
/**
* @return the -log10PError
*/
public double getNegLog10PError() {
return this.mQual / 10.0;
}
/**
* get the filter criteria
*
@ -257,13 +358,34 @@ public class VCFRecord {
return VCFHeader.HEADER_FIELDS.values().length;
}
/**
* return the mapping of the format tags to the specified sample's values
*
* @return a VCFGenotypeRecord
*/
public List<VCFGenotypeRecord> getVCFGenotypeRecords() {
return this.mGenotypeFields;
ArrayList<VCFGenotypeRecord> list = new ArrayList<VCFGenotypeRecord>();
for ( Genotype g : mGenotypeFields )
list.add((VCFGenotypeRecord)g);
return list;
}
public List<Genotype> getGenotypes() {
return mGenotypeFields;
}
public Genotype getCalledGenotype() {
if ( mGenotypeFields == null || mGenotypeFields.size() != 1 )
throw new IllegalStateException("There is not one and only one genotype associated with this record");
VCFGenotypeRecord record = (VCFGenotypeRecord)mGenotypeFields.get(0);
if ( record.isEmptyGenotype() )
return null;
return record;
}
public boolean hasGenotype(DiploidGenotype x) {
if ( mGenotypeFields == null )
return false;
for ( Genotype g : mGenotypeFields ) {
if ( DiploidGenotype.valueOf(g.getBases()).equals(x) )
return true;
}
return false;
}
/**
@ -271,9 +393,10 @@ public class VCFRecord {
*/
public String[] getSampleNames() {
String names[] = new String[mGenotypeFields.size()];
int index = 0;
for (VCFGenotypeRecord rec : this.mGenotypeFields)
names[index++] = rec.getSampleName();
for (int i = 0; i < mGenotypeFields.size(); i++) {
VCFGenotypeRecord rec = (VCFGenotypeRecord)mGenotypeFields.get(i);
names[i] = rec.getSampleName();
}
return names;
}
@ -289,14 +412,12 @@ public class VCFRecord {
this.mReferenceBase = referenceBase;
}
public void setPosition(int mPosition) {
if (mPosition < 0)
public void setLocation(String chrom, int position) {
if ( chrom == null )
throw new IllegalArgumentException("Chromosomes cannot be missing");
if ( position < 0 )
throw new IllegalArgumentException("Position values must be greater than 0");
this.mPosition = mPosition;
}
public void setChomosome(String mChrome) {
this.mChrome = mChrome;
this.mLoc = GenomeLocParser.createGenomeLoc(chrom, position);
}
public void setID(String mID) {
@ -369,9 +490,9 @@ public class VCFRecord {
StringBuilder builder = new StringBuilder();
// CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO
builder.append(getChromosome());
builder.append(mLoc.getContig());
builder.append(FIELD_SEPERATOR);
builder.append(getPosition());
builder.append(mLoc.getStart());
builder.append(FIELD_SEPERATOR);
builder.append(getID());
builder.append(FIELD_SEPERATOR);
@ -416,10 +537,10 @@ public class VCFRecord {
*/
private void addGenotypeData(StringBuilder builder, VCFHeader header) {
builder.append(FIELD_SEPERATOR + this.getGenotypeFormatString());
if (header.getGenotypeSamples().size() < getVCFGenotypeRecords().size())
if (header.getGenotypeSamples().size() < getGenotypes().size())
throw new RuntimeException("We have more genotype samples than the header specified");
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getVCFGenotypeRecords());
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getGenotypes());
for (String genotype : header.getGenotypeSamples()) {
builder.append(FIELD_SEPERATOR);
@ -447,8 +568,7 @@ public class VCFRecord {
public boolean equals(VCFRecord other) {
if (!this.mAlts.equals(other.mAlts)) return false;
if (this.mReferenceBase != other.mReferenceBase) return false;
if (!this.mChrome.equals(other.mChrome)) return false;
if (this.mPosition != other.mPosition) return false;
if (!this.mLoc.equals(other.mLoc)) return false;
if (!this.mID.equals(other.mID)) return false;
if (this.mQual != other.mQual) return false;
if (!this.mFilterString.equals(other.mFilterString)) return false;
@ -463,9 +583,10 @@ public class VCFRecord {
* @param list a list of genotype samples
* @return a mapping of the sample name to VCF genotype record
*/
private static Map<String, VCFGenotypeRecord> genotypeListToMap(List<VCFGenotypeRecord> list) {
private static Map<String, VCFGenotypeRecord> genotypeListToMap(List<Genotype> list) {
Map<String, VCFGenotypeRecord> map = new HashMap<String, VCFGenotypeRecord>();
for (VCFGenotypeRecord rec : list) {
for (int i = 0; i < list.size(); i++) {
VCFGenotypeRecord rec = (VCFGenotypeRecord)list.get(i);
map.put(rec.getSampleName(), rec);
}
return map;

View File

@ -1,9 +1,13 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.junit.Assert;
import org.junit.Test;
import org.junit.BeforeClass;
import java.io.*;
import java.util.ArrayList;
@ -18,6 +22,18 @@ public class VCFReaderTest extends BaseTest {
private static final File multiSampleVCF = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/MultiSample.vcf");
private static final String VCF_MIXUP_FILE = "/humgen/gsa-scr1/GATK_Data/Validation_Data/mixedup.vcf";
private static IndexedFastaSequenceFile seq;
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File("/broad/1KG/reference/human_b36_both.fasta"));
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
}
@Test
public void testVCFInput() {
VCFReader reader = new VCFReader(vcfFile);

View File

@ -1,10 +1,16 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.junit.Assert;
import org.junit.Test;
import org.junit.BeforeClass;
import java.util.*;
import java.io.File;
import java.io.FileNotFoundException;
/**
@ -16,6 +22,18 @@ import java.util.*;
*/
public class VCFRecordTest extends BaseTest {
private static IndexedFastaSequenceFile seq;
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
}
/**
* create a fake VCF record
*

View File

@ -1,10 +1,15 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.junit.Assert;
import org.junit.Test;
import org.junit.BeforeClass;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
@ -21,6 +26,18 @@ public class VCFWriterTest extends BaseTest {
private Set<String> additionalColumns = new HashSet<String>();
private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf");
private static IndexedFastaSequenceFile seq;
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
}
/** test, using the writer and reader, that we can output and input a VCF file without problems */
@Test
public void testBasicWriteAndRead() {