Cleaning up the VCF code, adding lots of tests for a variety of edge cases. Two issues are still outstanding: updating the no call string with the standard 1000g decided on today, and fixing Eric's issue where not all the VCF sample names are present initially.

also: their, I hope your happy Eric, from now on I'll try not to flout my awesomest grammer in the future accept when I need to illicit a strong response :-)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1858 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-10-16 04:11:34 +00:00
parent b82c3b6040
commit a69ea9b57c
13 changed files with 544 additions and 130 deletions

View File

@ -4,8 +4,11 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.*;
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
@ -115,8 +118,8 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
this.assertNotNull();
if (!mCurrentRecord.hasAlternateAllele())
return false;
for (String alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.length() != 1)
for (VCFGenotypeEncoding alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.getType() != VCFGenotypeEncoding.TYPE.SINGLE_BASE)
return false;
}
return true;
@ -132,8 +135,8 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
this.assertNotNull();
if (!mCurrentRecord.hasAlternateAllele())
return false;
for (String alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.startsWith("I"))
for (VCFGenotypeEncoding alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.getType() == VCFGenotypeEncoding.TYPE.INSERTION)
return true;
}
return false;
@ -149,8 +152,8 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
this.assertNotNull();
if (!mCurrentRecord.hasAlternateAllele())
return false;
for (String alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.startsWith("D"))
for (VCFGenotypeEncoding alt : this.mCurrentRecord.getAlternateAlleles()) {
if (alt.getType() == VCFGenotypeEncoding.TYPE.DELETION)
return true;
}
return false;
@ -208,7 +211,7 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
public String getAlternateBases() {
if (!this.isBiallelic())
throw new UnsupportedOperationException("We're not biallelic, so please call getAlternateBaseList instead");
return this.mCurrentRecord.getAlternateAlleles().get(0);
return this.mCurrentRecord.getAlternateAlleles().get(0).toString();
}
/**
@ -218,7 +221,10 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*/
@Override
public List<String> getAlternateBaseList() {
return this.mCurrentRecord.getAlternateAlleles();
List<String> list = new ArrayList<String>();
for (VCFGenotypeEncoding enc : mCurrentRecord.getAlternateAlleles())
list.add(enc.toString());
return list;
}
/**
@ -240,7 +246,8 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
@Override
public char getAlternativeBaseForSNP() {
if (!isSNP()) throw new IllegalStateException("we're not a SNP");
return mCurrentRecord.getAlternateAlleles().get(0).charAt(0);
if (mCurrentRecord.getAlternateAlleles().size() != 1) throw new UnsupportedOperationException("We're not a biallelic VCF site");
return (mCurrentRecord.getAlternateAlleles().get(0).toString()).charAt(0);
}
/**

View File

@ -14,10 +14,7 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.io.File;
import java.io.PrintStream;
@ -103,7 +100,7 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
int[] alleleNames = {0, 1, 2, 3};
double snpQual = 0.0;
int refbase = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
List<String> alts = new ArrayList<String>();
List<VCFGenotypeEncoding> alts = new ArrayList<VCFGenotypeEncoding>();
for (String name : vcfheader.getGenotypeSamples()) {
ReferenceOrderedDatum rod = tracker.lookup(sampleNamesToRods.get(name), null);
if (rod != null) {
@ -118,10 +115,10 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
if (!(rod instanceof VariantBackedByGenotype))
throw new IllegalArgumentException("The passed in variant type must be backed by genotype data");
Genotype genotype = ((VariantBackedByGenotype) rod).getCalledGenotype();
List<String> alleles = new ArrayList<String>();
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
for (char base : genotype.getBases().toCharArray()) {
alleles.add(String.valueOf(base));
if (base != ref.getBase() && !alts.contains(String.valueOf(base))) alts.add(String.valueOf(base));
alleles.add(new VCFGenotypeEncoding(String.valueOf(base)));
if (base != ref.getBase() && !alts.contains(String.valueOf(base))) alts.add(new VCFGenotypeEncoding(String.valueOf(base)));
}
int allele1 = BaseUtils.simpleBaseToBaseIndex(genotype.getBases().charAt(0));
int allele2 = BaseUtils.simpleBaseToBaseIndex(genotype.getBases().charAt(1));
@ -141,9 +138,9 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
snpQual += av.getNegLog10PError();
} else {
Map<String, String> str = new HashMap<String, String>();
List<String> alleles = new ArrayList<String>();
alleles.add(String.valueOf(ref.getBase()));
alleles.add(String.valueOf(ref.getBase()));
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
alleles.add(new VCFGenotypeEncoding(String.valueOf(ref.getBase())));
alleles.add(new VCFGenotypeEncoding(String.valueOf(ref.getBase())));
gt.add(new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED, str));
numRefs++;

View File

@ -0,0 +1,122 @@
package org.broadinstitute.sting.utils.genotype.vcf;
/**
* @author aaron
* <p/>
* Class VCFGenotypeEncoding
* <p/>
* basic encoding class for genotype fields in VCF
*/
public class VCFGenotypeEncoding {
    /** the kinds of allele a VCF genotype field can encode */
    public enum TYPE {
        SINGLE_BASE,
        INSERTION,
        DELETION,
        UNCALLED
    }

    // our length (0 for SINGLE_BASE and UNCALLED), our bases, and our type
    private final int mLength;
    private final String mBases;
    private final TYPE mType;

    /**
     * parse an allele string into its type, bases, and length.  Accepted forms are the no-call
     * string (VCFGenotypeRecord.EMPTY_GENOTYPE), a single base (A,C,G,T in either case),
     * an insertion ("I" followed by the inserted bases), or a deletion ("D" followed by a
     * non-negative length).  The I/D marker and the bases are matched case-insensitively and
     * independently of the JVM's default locale; bases are stored uppercased.
     *
     * @param baseString the allele string to parse
     *
     * @throws NullPointerException     if baseString is null
     * @throws IllegalArgumentException if baseString is not one of the accepted forms; a deletion
     *                                  whose length is not an integer is reported as a
     *                                  NumberFormatException (a subclass)
     */
    public VCFGenotypeEncoding(String baseString) {
        if (baseString == null) {
            throw new NullPointerException("The genotype encoding string must not be null");
        }
        // are we an empty (no-call) genotype?  Checked first, and not tied to a length of one,
        // so the check keeps working if the no-call string is changed
        if (baseString.equals(VCFGenotypeRecord.EMPTY_GENOTYPE)) {
            mBases = VCFGenotypeRecord.EMPTY_GENOTYPE;
            mLength = 0;
            mType = TYPE.UNCALLED;
        } else if (baseString.length() == 1) {
            if (!validBases(baseString)) {
                throw new IllegalArgumentException("Alleles of length 1 must be one of A,C,G,T, " + baseString + " was passed in");
            }
            // we're a valid base
            mBases = baseString.toUpperCase(java.util.Locale.ROOT);
            mLength = 0;
            mType = TYPE.SINGLE_BASE;
        } else { // deletion or insertion
            // Character.toUpperCase is locale-independent; String.toUpperCase() would turn a
            // lowercase 'i' into a dotted capital under a Turkish default locale
            final char marker = (baseString.length() == 0) ? '\0' : Character.toUpperCase(baseString.charAt(0));
            if (marker == 'D') {
                mLength = parseDeletionLength(baseString);
                mBases = "";
                mType = TYPE.DELETION;
            } else if (marker == 'I') {
                final String inserted = baseString.substring(1);
                if (!validBases(inserted)) {
                    throw new IllegalArgumentException("The insertion base string contained invalid bases -> " + baseString);
                }
                mBases = inserted.toUpperCase(java.util.Locale.ROOT);
                mLength = mBases.length();
                mType = TYPE.INSERTION;
            } else {
                throw new IllegalArgumentException("Genotype encoding of " + baseString + " was passed in, but is not a valid deletion, insertion, base, or no call (.)");
            }
        }
    }

    /**
     * @return the number of inserted bases for an insertion, the deleted length for a deletion,
     *         and 0 for a single base or a no-call
     */
    public int getLength() {
        return mLength;
    }

    /**
     * @return the uppercased base(s) for a single base or insertion, the no-call string for a
     *         no-call, and the empty string for a deletion
     */
    public String getBases() {
        return mBases;
    }

    /**
     * @return the type of allele this encoding represents
     */
    public TYPE getType() {
        return mType;
    }

    /**
     * two encodings are equal when they are of exactly the same class and have the same type,
     * bases, and length
     *
     * @param obj the object to compare against
     *
     * @return true if obj is an equal encoding, false otherwise (including for null)
     */
    @Override
    public boolean equals(Object obj) {
        if (obj != null && (obj.getClass().equals(this.getClass()))) {
            VCFGenotypeEncoding d = (VCFGenotypeEncoding) obj;
            return (mType == d.mType) && (mBases.equals(d.mBases)) && (mLength == d.mLength);
        }
        return false;
    }

    /**
     * @return a hash code built from the same three fields that equals() compares
     */
    @Override
    public int hashCode() {
        // our underlying data is immutable, so this is safe (we won't strand a value in a hashtable somewhere
        // when the data changes underneath, altering this value).
        String str = this.mBases + String.valueOf(this.mLength) + this.mType.toString();
        return str.hashCode();
    }

    /**
     * dump the string representation of this genotype encoding
     *
     * @return the bases for a single base or no-call, "I" plus the bases for an insertion,
     *         and "D" plus the length for a deletion
     */
    @Override
    public String toString() {
        switch (mType) {
            case INSERTION:
                return "I" + mBases;
            case DELETION:
                return "D" + mLength;
            default: // SINGLE_BASE and UNCALLED are written as their bases
                return mBases;
        }
    }

    /**
     * pull the length out of a deletion string of the form D&lt;length&gt;
     *
     * @param baseString the full deletion string, including the leading D
     *
     * @return the non-negative deletion length
     */
    private static int parseDeletionLength(String baseString) {
        final int length;
        try {
            length = Integer.parseInt(baseString.substring(1));
        } catch (NumberFormatException e) {
            // keep the exception type callers saw before, but say which encoding was at fault
            NumberFormatException wrapped = new NumberFormatException("The deletion encoding " + baseString + " does not contain a valid integer length");
            wrapped.initCause(e);
            throw wrapped;
        }
        if (length < 0) {
            throw new IllegalArgumentException("The deletion encoding " + baseString + " has a negative length");
        }
        return length;
    }

    /**
     * ensure that string contains valid bases
     *
     * @param bases the bases to check
     *
     * @return true if they're all either A,C,G,T (in either case); false otherwise
     */
    private static boolean validBases(String bases) {
        for (int i = 0; i < bases.length(); i++) {
            final char c = Character.toUpperCase(bases.charAt(i));
            if (c != 'A' && c != 'C' && c != 'G' && c != 'T') {
                return false;
            }
        }
        return true;
    }
}

View File

@ -27,7 +27,7 @@ public class VCFGenotypeRecord {
private PHASE mPhaseType;
// our bases(s)
private final List<String> mGenotypeAlleles = new ArrayList<String>();
private final List<VCFGenotypeEncoding> mGenotypeAlleles = new ArrayList<VCFGenotypeEncoding>();
// our mapping of the format mFields to values
private final Map<String, String> mFields = new HashMap<String, String>();
@ -43,7 +43,7 @@ public class VCFGenotypeRecord {
* @param phasing
* @param otherFlags
*/
public VCFGenotypeRecord(String sampleName, List<String> genotypes, PHASE phasing, Map<String, String> otherFlags) {
public VCFGenotypeRecord(String sampleName, List<VCFGenotypeEncoding> genotypes, PHASE phasing, Map<String, String> otherFlags) {
this.mSampleName = sampleName;
if (genotypes != null) this.mGenotypeAlleles.addAll(genotypes);
this.mPhaseType = phasing;
@ -78,7 +78,7 @@ public class VCFGenotypeRecord {
return mSampleName;
}
public List<String> getAlleles() {
public List<VCFGenotypeEncoding> getAlleles() {
return mGenotypeAlleles;
}
@ -86,10 +86,10 @@ public class VCFGenotypeRecord {
return mFields;
}
public String toGenotypeString(List<String> altAlleles) {
public String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) {
String str = "";
boolean first = true;
for (String allele : mGenotypeAlleles) {
for (VCFGenotypeEncoding allele : mGenotypeAlleles) {
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
if (first) {
switch (mPhaseType) {

View File

@ -199,8 +199,8 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
map.put("GQ", String.format("%.2f", qual));
params.addFormatItem("GQ");
List<String> alleles = createAlleleArray(gtype);
for (String allele : alleles) {
List<VCFGenotypeEncoding> alleles = createAlleleArray(gtype);
for (VCFGenotypeEncoding allele : alleles) {
params.addAlternateBase(allele);
}
@ -218,10 +218,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
*
* @return a list of genotype encodings, one for each base in the genotype's base string
*/
private List<String> createAlleleArray(Genotype gtype) {
List<String> alleles = new ArrayList<String>();
private List<VCFGenotypeEncoding> createAlleleArray(Genotype gtype) {
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
for (char allele : gtype.getBases().toCharArray()) {
alleles.add(String.valueOf(allele));
alleles.add(new VCFGenotypeEncoding(String.valueOf(allele)));
}
return alleles;
}

View File

@ -18,7 +18,7 @@ class VCFParameters {
private boolean initialized = false;
private List<VCFGenotypeRecord> genotypesRecord = new ArrayList<VCFGenotypeRecord>();
private List<String> formatList = new ArrayList<String>();
private List<String> alternateBases = new ArrayList<String>();
private List<VCFGenotypeEncoding> alternateBases = new ArrayList<VCFGenotypeEncoding>();
public void setLocations(GenomeLoc location, char refBase) {
// if we haven't set it up, we initialize the object
@ -64,12 +64,12 @@ class VCFParameters {
formatList.add(item);
}
public void addAlternateBase(String base) {
if (!alternateBases.contains(String.valueOf(base)) && !base.equals(String.valueOf(this.getReferenceBase())))
public void addAlternateBase(VCFGenotypeEncoding base) {
if (!alternateBases.contains(base) && !base.toString().equals(String.valueOf(this.getReferenceBase())))
alternateBases.add(base);
}
public List<String> getAlternateBases() {
public List<VCFGenotypeEncoding> getAlternateBases() {
return alternateBases;
}

View File

@ -216,7 +216,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
// parameters to create the VCF genotype record
Map<String, String> tagToValue = new HashMap<String, String>();
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
List<String> bases = new ArrayList<String>();
List<VCFGenotypeEncoding> bases = new ArrayList<VCFGenotypeEncoding>();
String keyStrings[] = formatString.split(":");
for (String key : keyStrings) {
@ -262,15 +262,15 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
* @param referenceBase the reference base
* @param bases the list of bases for this genotype call
*/
private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List<String> bases) {
private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List<VCFGenotypeEncoding> bases) {
int alleleValue = Integer.valueOf(alleleNumber);
// check to make sure the allele value is within bounds
if (alleleValue < 0 || alleleValue > altAlleles.length)
throw new IllegalArgumentException("VCFReader: the allele value of " + alleleValue + " is out of bounds given the alternate allele list.");
if (alleleValue == 0)
bases.add(String.valueOf(referenceBase));
bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase)));
else
bases.add(altAlleles[alleleValue - 1]);
bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1]));
}

View File

@ -5,7 +5,9 @@ import org.broadinstitute.sting.utils.Utils;
import java.util.*;
/** the basic VCF record type */
/**
* the basic VCF record type
*/
public class VCFRecord {
// commonly used strings that are in the standard
public static final String FORMAT_FIELD_SEPERATOR = ":";
@ -24,7 +26,7 @@ public class VCFRecord {
// our id; set to '.' if not available
private String mID;
// the alternate bases
private final List<String> mAlts = new ArrayList<String>();
private final List<VCFGenotypeEncoding> mAlts = new ArrayList<VCFGenotypeEncoding>();
// our qual value
private double mQual;
// our filter string
@ -67,7 +69,7 @@ public class VCFRecord {
String contig,
int position,
String ID,
List<String> altBases,
List<VCFGenotypeEncoding> altBases,
double qual,
String filters,
Map<String, String> infoFields,
@ -77,7 +79,7 @@ public class VCFRecord {
this.mChrome = contig;
this.setPosition(position);
this.mID = ID;
for (String alt : altBases)
for (VCFGenotypeEncoding alt : altBases)
this.addAlternateBase(alt);
this.setQual(qual);
this.setFilterString(filters);
@ -121,7 +123,7 @@ public class VCFRecord {
case ALT:
String values[] = columnValues.get(val).split(",");
for (String alt : values)
addAlternateBase(alt);
addAlternateBase(new VCFGenotypeEncoding(alt));
break;
case QUAL:
this.setQual(Double.valueOf(columnValues.get(val)));
@ -156,18 +158,24 @@ public class VCFRecord {
return (mGenotypeFields.size() > 0);
}
/** @return the string for the chromosome that this VCF record is associated with */
/**
* @return the string for the chromosome that this VCF record is associated with
*/
public String getChromosome() {
return this.mChrome;
}
/** @return this VCF records position on the specified chromosome */
/**
* @return this VCF record's position on the specified chromosome
*/
public long getPosition() {
return this.mPosition;
}
/** @return the ID value for this record */
/**
* @return the ID value for this record
*/
public String getID() {
return this.mID;
}
@ -186,7 +194,7 @@ public class VCFRecord {
*
* @return the list of alternate allele encodings; the list is empty (never null) if there are none
*/
public List<String> getAlternateAlleles() {
public List<VCFGenotypeEncoding> getAlternateAlleles() {
return this.mAlts;
}
@ -194,7 +202,9 @@ public class VCFRecord {
return getAlternateAlleles().size() > 0;
}
/** @return the phred-scaled quality score */
/**
* @return the phred-scaled quality score
*/
public double getQual() {
return this.mQual;
}
@ -206,7 +216,7 @@ public class VCFRecord {
*/
public String[] getFilteringCodes() {
if (mFilterString == null) return new String[]{"0"};
return this.mFilterString.split(";");
return this.mFilterString.split(FILTER_CODE_SEPERATOR);
}
public boolean hasFilteringCodes() {
@ -227,7 +237,9 @@ public class VCFRecord {
return this.mInfoFields;
}
/** @return the number of columnsof data we're storing */
/**
* @return the number of columns of data we're storing
*/
public int getColumnCount() {
if (this.hasGenotypeData()) return mGenotypeFields.size() + VCFHeader.HEADER_FIELDS.values().length;
return VCFHeader.HEADER_FIELDS.values().length;
@ -242,7 +254,9 @@ public class VCFRecord {
return this.mGenotypeFields;
}
/** @return a List of the sample names */
/**
* @return an array of the sample names
*/
public String[] getSampleNames() {
String names[] = new String[mGenotypeFields.size()];
int index = 0;
@ -287,24 +301,26 @@ public class VCFRecord {
this.mFilterString = mFilterString;
}
public void addGenotypeFields(VCFGenotypeRecord mGenotypeFields) {
public void addGenotypeField(VCFGenotypeRecord mGenotypeFields) {
this.mGenotypeFields.add(mGenotypeFields);
}
public void addAlternateBase(String base) {
if (base.length() == 1) {
char nuc = (char) ((base.charAt(0) > 96) ? base.charAt(0) - 32 : base.charAt(0));
if (nuc != 'A' && nuc != 'C' && nuc != 'T' && nuc != 'G' && nuc != '.')
throw new IllegalArgumentException("Alternate base must be either A,C,T,G,. or if an indel it must contain length information: " + base);
} else {
// we must be an indel, check that the first character is I or D
char nuc = (char) ((base.charAt(0) > 96) ? base.charAt(0) - 32 : base.charAt(0));
if (nuc != 'I' && nuc != 'D')
throw new IllegalArgumentException("Alternate bases of length greater then one must be an indel: " + base);
}
this.mAlts.add(base);
/**
* add an alternate base to our alternate base list. An encoding equal to one
* already in the list is not added a second time.
*
* @param base the base to add
*/
public void addAlternateBase(VCFGenotypeEncoding base) {
if (!mAlts.contains(base)) mAlts.add(base);
}
/**
* add an info field to the record
*
* @param key the key, from the spec or a user created key
* @param value its value as a string
*/
public void addInfoField(String key, String value) {
this.mInfoFields.put(key, value);
}
@ -312,31 +328,29 @@ public class VCFRecord {
/**
* the generation of a string representation, which is used by the VCF writer
*
* @param header the VCF header for this VCF Record
* @return a string
*/
public String toStringRepresentation(VCFHeader header) {
StringBuilder builder = new StringBuilder();
// CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO
builder.append(getChromosome() + FIELD_SEPERATOR);
builder.append(getPosition() + FIELD_SEPERATOR);
builder.append(getID() + FIELD_SEPERATOR);
builder.append(getReferenceBase() + FIELD_SEPERATOR);
builder.append(getChromosome());
builder.append(FIELD_SEPERATOR);
builder.append(getPosition());
builder.append(FIELD_SEPERATOR);
builder.append(getID());
builder.append(FIELD_SEPERATOR);
builder.append(getReferenceBase());
builder.append(FIELD_SEPERATOR);
String alts = "";
for (String str : this.getAlternateAlleles()) alts += str + ",";
for (VCFGenotypeEncoding str : this.getAlternateAlleles()) alts += str.toString() + ",";
builder.append((alts.length() > 0) ? alts.substring(0, alts.length() - 1) + FIELD_SEPERATOR : "." + FIELD_SEPERATOR);
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING,getQual()) + FIELD_SEPERATOR);
builder.append(Utils.join(FILTER_CODE_SEPERATOR, getFilteringCodes()) + FIELD_SEPERATOR);
String info = "";
for (String str : this.getInfoValues().keySet()) {
if (str.equals(EMPTY_INFO_FIELD))
info = EMPTY_INFO_FIELD;
else
info += str + "=" + getInfoValues().get(str) + INFO_FIELD_SEPERATOR;
}
if (info.length() > 1) builder.append(info.substring(0, info.length() - 1));
else builder.append(info);
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, getQual()));
builder.append(FIELD_SEPERATOR);
builder.append(Utils.join(FILTER_CODE_SEPERATOR, getFilteringCodes()));
builder.append(FIELD_SEPERATOR);
builder.append(createInfoString());
if (this.hasGenotypeData()) {
addGenotypeData(builder, header);
@ -344,6 +358,22 @@ public class VCFRecord {
return builder.toString();
}
/**
 * build the text of the INFO column from our info key/value pairs
 *
 * @return EMPTY_INFO_FIELD if that key is encountered while walking the info map; otherwise the
 *         key=value pairs joined by INFO_FIELD_SEPERATOR (an empty string when there are no pairs)
 */
protected String createInfoString() {
    StringBuilder joined = new StringBuilder();
    for (String key : this.getInfoValues().keySet()) {
        // the empty-info marker stands in for the whole column
        if (key.equals(EMPTY_INFO_FIELD)) {
            return EMPTY_INFO_FIELD;
        }
        joined.append(key);
        joined.append("=");
        joined.append(getInfoValues().get(key));
        joined.append(INFO_FIELD_SEPERATOR);
    }
    // every pair was written with a trailing separator; cut the text at the last one
    String info = joined.toString();
    int lastSeparator = info.lastIndexOf(INFO_FIELD_SEPERATOR);
    if (lastSeparator < 0) {
        return info;
    }
    return info.substring(0, lastSeparator);
}
/**
* add the genotype data
*
@ -358,9 +388,7 @@ public class VCFRecord {
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getVCFGenotypeRecords());
for (String genotype : header.getGenotypeSamples()) {
builder.append(FIELD_SEPERATOR);
if (gMap.containsKey(genotype)) {
VCFGenotypeRecord rec = gMap.get(genotype);
if (!rec.toGenotypeString(this.mAlts).equals(""))
@ -386,7 +414,6 @@ public class VCFRecord {
* compare two VCF records
*
* @param other the other VCF record
*
* @return true if they're equal
*/
public boolean equals(VCFRecord other) {
@ -406,7 +433,6 @@ public class VCFRecord {
* create a genotype mapping from a list and their sample names
*
* @param list a list of genotype samples
*
* @return a mapping of the sample name to VCF genotype record
*/
private static Map<String, VCFGenotypeRecord> genotypeListToMap(List<VCFGenotypeRecord> list) {

View File

@ -91,7 +91,7 @@ public class RodVCFTest extends BaseTest {
@Test
public void testToString() {
// slightly altered line, due to map ordering
String firstLine = "20\t14370\trs6054257\tG\tA\t29.00\t0\tDP=258;AF=0.786;NS=58\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5\n";
final String firstLine = "20\t14370\trs6054257\tG\tA\t29.00\t0\tDP=258;AF=0.786;NS=58\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5\n";
RodVCF vcf = getVCFObject();
VCFReader reader = new VCFReader(vcfFile);
Iterator<RodVCF> iter = vcf.createIterator("VCF", vcfFile);

View File

@ -0,0 +1,151 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.BaseTest;
import org.junit.Assert;
import org.junit.Test;
/**
* @author aaron
* <p/>
* Class VCFGenotypeEncodingTest
* <p/>
* test the VCFGenotypeEncoding class
*/
public class VCFGenotypeEncodingTest extends BaseTest {

    /**
     * decode a one-character allele and check every property of the resulting encoding, so that
     * each input is verified against its own encoding object
     *
     * @param input    the allele string handed to the constructor
     * @param expected the string representation we expect back (the uppercased base)
     */
    private static void assertSingleBase(String input, String expected) {
        VCFGenotypeEncoding enc = new VCFGenotypeEncoding(input);
        Assert.assertEquals(expected, enc.toString());
        Assert.assertEquals(0, enc.getLength());
        Assert.assertEquals(VCFGenotypeEncoding.TYPE.SINGLE_BASE, enc.getType());
    }

    /** all four bases decode as SINGLE_BASE, in upper or lower case, and come back uppercased */
    @Test
    public void testDecodingSingle() {
        assertSingleBase("A", "A");
        assertSingleBase("C", "C");
        assertSingleBase("G", "G");
        assertSingleBase("T", "T");
        assertSingleBase("a", "A");
        assertSingleBase("c", "C");
        assertSingleBase("g", "G");
        assertSingleBase("t", "T");
    }

    /** a single character that is not a base is rejected */
    @Test(expected = IllegalArgumentException.class)
    public void testDecodingSingleBadBase() {
        new VCFGenotypeEncoding("E");
    }

    /** an insertion marker with nothing after it is rejected */
    @Test(expected = IllegalArgumentException.class)
    public void testDecodingSingleWrongBase() {
        new VCFGenotypeEncoding("I");
    }

    /** an insertion keeps its bases and reports their count as its length */
    @Test
    public void testValidIndel() {
        VCFGenotypeEncoding enc = new VCFGenotypeEncoding("IAGGC");
        Assert.assertEquals(4, enc.getLength());
        Assert.assertEquals("AGGC", enc.getBases());
        Assert.assertEquals(VCFGenotypeEncoding.TYPE.INSERTION, enc.getType());
    }

    /** an insertion containing a non-ACGT base is rejected */
    @Test(expected = IllegalArgumentException.class)
    public void testBadIndel() {
        new VCFGenotypeEncoding("IAGRC");
    }

    /** a deletion carries a length and no bases */
    @Test
    public void testValidDel() {
        VCFGenotypeEncoding enc = new VCFGenotypeEncoding("D40");
        Assert.assertEquals(40, enc.getLength());
        Assert.assertEquals("", enc.getBases());
        Assert.assertEquals(VCFGenotypeEncoding.TYPE.DELETION, enc.getType());
    }

    /** a deletion followed by bases instead of a length is rejected */
    @Test(expected = IllegalArgumentException.class)
    public void testBadDel() {
        new VCFGenotypeEncoding("DAGCT");
    }

    /** the no-call string decodes as UNCALLED with zero length */
    @Test
    public void testValidNoCall() {
        VCFGenotypeEncoding enc = new VCFGenotypeEncoding(".");
        Assert.assertEquals(0, enc.getLength());
        Assert.assertEquals(".", enc.getBases());
        Assert.assertEquals(VCFGenotypeEncoding.TYPE.UNCALLED, enc.getType());
    }

    /** two no-call characters in a row are not a valid encoding */
    @Test(expected = IllegalArgumentException.class)
    public void testBadNoCall() {
        new VCFGenotypeEncoding("..");
    }

    /** equal inputs give equal encodings; different inputs of the same type do not */
    @Test
    public void testEquals() {
        VCFGenotypeEncoding enc = new VCFGenotypeEncoding("A");
        VCFGenotypeEncoding enc2 = new VCFGenotypeEncoding("A");
        VCFGenotypeEncoding enc3 = new VCFGenotypeEncoding("C");
        Assert.assertTrue(enc.equals(enc2));
        Assert.assertFalse(enc.equals(enc3));

        enc = new VCFGenotypeEncoding("D40");
        enc2 = new VCFGenotypeEncoding("D40");
        enc3 = new VCFGenotypeEncoding("D41");
        Assert.assertTrue(enc.equals(enc2));
        Assert.assertFalse(enc.equals(enc3));

        enc = new VCFGenotypeEncoding("IAAC");
        enc2 = new VCFGenotypeEncoding("IAAC");
        enc3 = new VCFGenotypeEncoding("IACG");
        Assert.assertTrue(enc.equals(enc2));
        Assert.assertFalse(enc.equals(enc3));

        enc = new VCFGenotypeEncoding(".");
        enc2 = new VCFGenotypeEncoding(".");
        Assert.assertTrue(enc.equals(enc2));
    }

    /** equal encodings share a hash code; these particular unequal pairs do not */
    @Test
    public void testHashCode() {
        VCFGenotypeEncoding enc = new VCFGenotypeEncoding("A");
        VCFGenotypeEncoding enc2 = new VCFGenotypeEncoding("A");
        VCFGenotypeEncoding enc3 = new VCFGenotypeEncoding("C");
        Assert.assertTrue(enc.hashCode() == enc2.hashCode());
        Assert.assertTrue(enc.hashCode() != enc3.hashCode());

        enc = new VCFGenotypeEncoding("D40");
        enc2 = new VCFGenotypeEncoding("D40");
        enc3 = new VCFGenotypeEncoding("D41");
        Assert.assertTrue(enc.hashCode() == enc2.hashCode());
        Assert.assertTrue(enc.hashCode() != enc3.hashCode());

        enc = new VCFGenotypeEncoding("IAAC");
        enc2 = new VCFGenotypeEncoding("IAAC");
        enc3 = new VCFGenotypeEncoding("IACG");
        Assert.assertTrue(enc.hashCode() == enc2.hashCode());
        Assert.assertTrue(enc.hashCode() != enc3.hashCode());

        enc = new VCFGenotypeEncoding(".");
        enc2 = new VCFGenotypeEncoding(".");
        Assert.assertTrue(enc.hashCode() == enc2.hashCode());
    }
}

View File

@ -37,12 +37,12 @@ public class VCFReaderTest extends BaseTest {
public void testBasicParsing() {
String formatString = "GT:B:C:D";
String genotypeString = "0|1:2:3:4";
String altAlleles[] = {"A","C","G","T"};
char referenceBase = 'N';
String altAlleles[] = {"A","G","T"};
char referenceBase = 'C';
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
Assert.assertEquals("N",rec.getAlleles().get(0));
Assert.assertEquals("A",rec.getAlleles().get(1));
Assert.assertEquals("C",rec.getAlleles().get(0).toString());
Assert.assertEquals("A",rec.getAlleles().get(1).toString());
Map<String,String> values = rec.getFields();
Assert.assertEquals(3,values.size());
Assert.assertTrue(values.get("B").equals("2"));
@ -58,12 +58,12 @@ public class VCFReaderTest extends BaseTest {
public void testMissingFieldParsing() {
String formatString = "GT:B:C:D";
String genotypeString = "0|1:::4";
String altAlleles[] = {"A","C","G","T"};
char referenceBase = 'N';
String altAlleles[] = {"A","G","T"};
char referenceBase = 'C';
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
Assert.assertEquals("N",rec.getAlleles().get(0));
Assert.assertEquals("A",rec.getAlleles().get(1));
Assert.assertEquals("C",rec.getAlleles().get(0).toString());
Assert.assertEquals("A",rec.getAlleles().get(1).toString());
Map<String,String> values = rec.getFields();
Assert.assertEquals(3,values.size());
Assert.assertTrue(values.get("B").equals(""));
@ -78,12 +78,12 @@ public class VCFReaderTest extends BaseTest {
public void testMissingAllFields() {
String formatString = "GT:B:C:D";
String genotypeString = "0|1:::";
String altAlleles[] = {"A","C","G","T"};
char referenceBase = 'N';
String altAlleles[] = {"A","G","T"};
char referenceBase = 'C';
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
Assert.assertEquals("N",rec.getAlleles().get(0));
Assert.assertEquals("A",rec.getAlleles().get(1));
Assert.assertEquals("C",rec.getAlleles().get(0).toString());
Assert.assertEquals("A",rec.getAlleles().get(1).toString());
Map<String,String> values = rec.getFields();
Assert.assertEquals(3,values.size());
Assert.assertTrue(values.get("B").equals(""));

View File

@ -11,36 +11,147 @@ import java.util.Map;
/**
*
* @author aaron
*
* Class VCFRecordTest
*
* test the basic functionality of the vcf record
* @author aaron
* <p/>
* Class VCFRecordTest
* <p/>
* test the basic functionality of the vcf record
*/
public class VCFRecordTest extends BaseTest {
private VCFRecord makeFakeVCFRecord() {
List<String> altBases = new ArrayList<String>();
altBases.add("C");
altBases.add("D1");
Map<String,String> infoFields = new HashMap<String,String>();
infoFields.put("DP","50");
/**
* create a fake VCF record: reference base 'A' on "chr1" at position 1 with ID "RANDOM",
* alternate alleles "C" and "D1", and the genotype format string "GT:AA"
*
* @param infoFields the info-field map handed straight to the VCFRecord constructor
* @return a VCFRecord
*/
private static VCFRecord makeFakeVCFRecord(Map<String, String> infoFields) {
List<VCFGenotypeEncoding> altBases = new ArrayList<VCFGenotypeEncoding>();
altBases.add(new VCFGenotypeEncoding("C"));
altBases.add(new VCFGenotypeEncoding("D1"));
List<VCFGenotypeRecord> genotypeObjects = new ArrayList<VCFGenotypeRecord>();
Map<String, String> keyValues = new HashMap<String,String>();
keyValues.put("AA","2");
List<String> Alleles = new ArrayList<String>();
Alleles.add("A");
genotypeObjects.add(new VCFGenotypeRecord("SampleName", Alleles, VCFGenotypeRecord.PHASE.PHASED, keyValues));
return new VCFRecord('A',"chr1",1,"RANDOM",altBases,0,".",infoFields, "GT:AA",genotypeObjects);
genotypeObjects.add(createGenotype("sample1", "A", "A"));
return new VCFRecord('A', "chr1", 1, "RANDOM", altBases, 0, ".", infoFields, "GT:AA", genotypeObjects);
}
/**
 * Builds a phased genotype record for one sample from a pair of allele strings.
 *
 * @param name    the sample name stored on the record
 * @param Allele1 text of the first allele; wrapped in a VCFGenotypeEncoding
 * @param Allele2 text of the second allele; wrapped in a VCFGenotypeEncoding
 * @return a VCFGenotypeRecord holding both alleles, PHASED, with the single field AA=2
 */
private static VCFGenotypeRecord createGenotype(String name, String Allele1, String Allele2) {
    List<VCFGenotypeEncoding> encodedAlleles = new ArrayList<VCFGenotypeEncoding>();
    for (String alleleText : new String[]{Allele1, Allele2}) {
        encodedAlleles.add(new VCFGenotypeEncoding(alleleText));
    }

    Map<String, String> fields = new HashMap<String, String>();
    fields.put("AA", "2");

    return new VCFGenotypeRecord(name, encodedAlleles, VCFGenotypeRecord.PHASE.PHASED, fields);
}
/**
 * Constructs a record whose alternate list contains "D1" twice and expects
 * getAlternateAlleles() to report only two entries.
 */
@Test
public void testAddReduntantAlts() {
    List<VCFGenotypeEncoding> alternates = new ArrayList<VCFGenotypeEncoding>();
    for (String alleleText : new String[]{"C", "D1", "D1"}) {
        alternates.add(new VCFGenotypeEncoding(alleleText));
    }

    List<VCFGenotypeRecord> genotypes = new ArrayList<VCFGenotypeRecord>();
    genotypes.add(createGenotype("sample1", "A", "A"));

    Map<String, String> noInfoFields = new HashMap<String, String>();
    VCFRecord rec = new VCFRecord('A', "chr1", 1, "RANDOM", alternates, 0, ".", noInfoFields, "GT:AA", genotypes);

    // three alternates went in, but one was a repeat
    Assert.assertEquals(2, rec.getAlternateAlleles().size());
}
/**
 * A record from makeFakeVCFRecord is expected to expose exactly one genotype:
 * sample "sample1" with the two alleles "A" and "A".
 */
@Test
public void testGetOneGenotype() {
    Map<String, String> infoFields = new HashMap<String, String>();
    VCFRecord rec = makeFakeVCFRecord(infoFields);
    List<VCFGenotypeRecord> genotypeObjects = rec.getVCFGenotypeRecords();
    Assert.assertEquals(1, genotypeObjects.size());

    VCFGenotypeRecord only = genotypeObjects.get(0);
    // assertEquals (instead of assertTrue(a.equals(b))) reports expected vs. actual on failure
    Assert.assertEquals("sample1", only.getSampleName());
    Assert.assertEquals(2, only.getAlleles().size());
    Assert.assertEquals("A", only.getAlleles().get(0).toString());
    Assert.assertEquals("A", only.getAlleles().get(1).toString());
}
/**
 * After a second genotype is added to the fake record, both genotypes are expected
 * back from getVCFGenotypeRecords() in insertion order with their alleles intact.
 */
@Test
public void testGetGenotypes() {
    Map<String, String> infoFields = new HashMap<String, String>();
    VCFRecord rec = makeFakeVCFRecord(infoFields);
    rec.addGenotypeField(createGenotype("sample2", "C", "A"));
    List<VCFGenotypeRecord> genotypeObjects = rec.getVCFGenotypeRecords();
    Assert.assertEquals(2, genotypeObjects.size());

    // the genotype that makeFakeVCFRecord put in
    VCFGenotypeRecord first = genotypeObjects.get(0);
    // assertEquals (instead of assertTrue(a.equals(b))) reports expected vs. actual on failure
    Assert.assertEquals("sample1", first.getSampleName());
    Assert.assertEquals(2, first.getAlleles().size());
    Assert.assertEquals("A", first.getAlleles().get(0).toString());
    Assert.assertEquals("A", first.getAlleles().get(1).toString());

    // the genotype added by this test
    VCFGenotypeRecord second = genotypeObjects.get(1);
    Assert.assertEquals("sample2", second.getSampleName());
    Assert.assertEquals(2, second.getAlleles().size());
    Assert.assertEquals("C", second.getAlleles().get(0).toString());
    Assert.assertEquals("A", second.getAlleles().get(1).toString());
}
/**
 * Checks createInfoString() for three cases: no info fields (expects "."),
 * one field (expects "DP=50"), and two fields, where either key order is accepted.
 */
@Test
public void testCreateInfoString() {
    Map<String, String> infoFields = new HashMap<String, String>();
    VCFRecord rec = makeFakeVCFRecord(infoFields);
    // assertEquals (instead of assertTrue(a.equals(b))) reports expected vs. actual on failure
    Assert.assertEquals(".", rec.createInfoString());

    infoFields.put("DP", "50");
    VCFRecord rec2 = makeFakeVCFRecord(infoFields);
    Assert.assertEquals("DP=50", rec2.createInfoString());

    rec2.addInfoField("AB", "CD");
    // evaluate once so the failure message shows exactly the string that was checked
    String combined = rec2.createInfoString();
    Assert.assertTrue("unexpected info string: " + combined,
            combined.equals("DP=50;AB=CD") || combined.equals("AB=CD;DP=50"));
}
@Test
public void testGetGenotypes() {
VCFRecord rec = makeFakeVCFRecord();
List<VCFGenotypeRecord> genotypeObjects = rec.getVCFGenotypeRecords();
Assert.assertEquals(1,genotypeObjects.size());
Assert.assertTrue(genotypeObjects.get(0).getSampleName().equals("SampleName"));
public void testAddAlts() {
    // the fake record starts with alternates "C" and "D1"; "T" is then offered five times
    // and the test expects it to be counted once, for three alternates in total
    Map<String, String> infoFields = new HashMap<String, String>();
    VCFRecord rec = makeFakeVCFRecord(infoFields);
    for (int attempt = 0; attempt < 5; attempt++) {
        rec.addAlternateBase(new VCFGenotypeEncoding("T"));
    }
    int alternateCount = rec.getAlternateAlleles().size();
    Assert.assertEquals(3, alternateCount);
}
/**
 * create a fake header of known quantity: meta-data entries format=VCRv3.2 and two=2,
 * plus the additional columns "FORMAT" and "sample1"
 *
 * @return a fake VCF header
 */
public static VCFHeader createFakeHeader() {
    // parameterized (was a raw HashMap) so the assignment is type-checked
    Map<String, String> metaData = new HashMap<String, String>();
    List<String> additionalColumns = new ArrayList<String>();
    // NOTE(review): "VCRv3.2" may be a typo for "VCFv3.2"; left as-is because the
    // VCFWriterTest fixture uses the same value -- confirm before changing
    metaData.put("format", "VCRv3.2"); // required
    metaData.put("two", "2");
    additionalColumns.add("FORMAT");
    additionalColumns.add("sample1");
    return new VCFHeader(metaData, additionalColumns);
}
// expected toStringRepresentation() output for the fake record when DP=50 is its only info field
private static final String stringRep = "chr1\t1\tRANDOM\tA\tC,D1\t0.00\t.\tDP=50\tGT:AA\t0|0:2";
// expected toStringRepresentation() output once the AB=CD info field has been added as well
private static final String stringRep2 = "chr1\t1\tRANDOM\tA\tC,D1\t0.00\t.\tAB=CD;DP=50\tGT:AA\t0|0:2";
// disabled placeholder: currently identical to stringRep2 and not referenced by any test
//private static final String stringRep3 = "chr1\t1\tRANDOM\tA\tC,D1\t0.00\t.\tAB=CD;DP=50\tGT:AA\t0|0:2";
/**
 * Compares toStringRepresentation() against the expected tab-separated lines,
 * first with DP=50 as the only info field and then after AB=CD has been added.
 */
@Test
public void testStringRepresentation() {
    Map<String, String> infoFields = new HashMap<String, String>();
    infoFields.put("DP", "50");
    VCFRecord rec = makeFakeVCFRecord(infoFields);

    // assertEquals (instead of assertTrue(a.equals(b))) shows the differing text on failure
    String rep = rec.toStringRepresentation(createFakeHeader());
    Assert.assertEquals(stringRep, rep);

    rec.addInfoField("AB", "CD");
    String rep2 = rec.toStringRepresentation(createFakeHeader());
    // NOTE(review): stringRep2 pins AB before DP, while testCreateInfoString accepts
    // either order -- confirm the info-field ordering is actually guaranteed
    Assert.assertEquals(stringRep2, rep2);

    // TODO: also cover a record with an extra genotype, e.g. createGenotype("sample3", "A", "D12")
}
}

View File

@ -24,7 +24,7 @@ public class VCFWriterTest extends BaseTest {
/** test, using the writer and reader, that we can output and input a VCF file without problems */
@Test
public void testBasicWriteAndRead() {
VCFHeader header = createFakeHeader();
VCFHeader header = createFakeHeader(metaData,additionalColumns);
VCFWriter writer = new VCFWriter(header,fakeVCFFile);
writer.addRecord(createVCFRecord(header));
writer.addRecord(createVCFRecord(header));
@ -45,7 +45,7 @@ public class VCFWriterTest extends BaseTest {
* create a fake header of known quantity
* @return a fake VCF header
*/
private VCFHeader createFakeHeader() {
public static VCFHeader createFakeHeader(Map<String, String> metaData, List<String> additionalColumns) {
metaData.put("format", "VCRv3.2"); // required
metaData.put("two", "2");
additionalColumns.add("FORMAT");
@ -60,9 +60,9 @@ public class VCFWriterTest extends BaseTest {
* @return a VCFRecord
*/
private VCFRecord createVCFRecord(VCFHeader header) {
List<String> altBases = new ArrayList<String>();
altBases.add("C");
altBases.add("D1");
List<VCFGenotypeEncoding> altBases = new ArrayList<VCFGenotypeEncoding>();
altBases.add(new VCFGenotypeEncoding("C"));
altBases.add(new VCFGenotypeEncoding("D1"));
Map<String,String> infoFields = new HashMap<String,String>();
infoFields.put("DP","50");
@ -71,9 +71,9 @@ public class VCFWriterTest extends BaseTest {
Map<String,String> str = new HashMap<String,String>();
str.put("bb","0");
List<String> myAlleles = new ArrayList<String>();
myAlleles.add("C");
myAlleles.add("D1");
List<VCFGenotypeEncoding> myAlleles = new ArrayList<VCFGenotypeEncoding>();
myAlleles.add(new VCFGenotypeEncoding("C"));
myAlleles.add(new VCFGenotypeEncoding("D1"));
gt.add(new VCFGenotypeRecord(name, myAlleles, VCFGenotypeRecord.PHASE.PHASED, str));
}
return new VCFRecord('A',"chr1",1,"RANDOM",altBases,0,".",infoFields, "GT:AA",gt);