Cleaned up the interface to VCFRecord. It's now possible (and easy) to create records and then write them with a VCFWriter.

I've updated HapMap2VCF to use the new interface; Chris agreed to take care of Sequenom2VCF.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2558 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-01-11 21:42:12 +00:00
parent 3feead019d
commit 040fdfee61
4 changed files with 104 additions and 81 deletions

View File

@ -8,12 +8,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.VariationCall;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.*;
import java.io.File;
@ -26,11 +21,11 @@ public class HapMap2VCF extends RodWalker<Integer, Integer> {
@Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true)
protected File VCF_OUT;
private VCFGenotypeWriterAdapter vcfWriter;
private VCFWriter vcfWriter;
String[] sample_ids = null;
public void initialize() {
vcfWriter = new VCFGenotypeWriterAdapter(VCF_OUT);
vcfWriter = new VCFWriter(VCF_OUT);
}
/**
@ -43,7 +38,6 @@ public class HapMap2VCF extends RodWalker<Integer, Integer> {
for ( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
if ( rod instanceof HapMapGenotypeROD ) {
GenomeLoc loc = context.getLocation();
HapMapGenotypeROD hapmap_rod = (HapMapGenotypeROD) rod;
// If this is the first time map is called, we need to fill out the sample_ids from the rod
@ -58,37 +52,49 @@ public class HapMap2VCF extends RodWalker<Integer, Integer> {
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "HapMap2VCF"));
hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
// write out the header once
vcfWriter.writeHeader(sample_id_set, hInfo);
vcfWriter.writeHeader(new VCFHeader(hInfo, sample_id_set));
}
// Get reference base
Character ref_allele = ref.getBase();
VCFGenotypeEncoding refAllele = new VCFGenotypeEncoding(ref_allele.toString());
// Print each sample's genotype info
List<Genotype> genotype_calls = new ArrayList<Genotype>();
// Create a new record
VCFRecord record = new VCFRecord(ref_allele, context.getLocation(), "GT");
// Record each sample's genotype info
String[] hapmap_rod_genotypes = hapmap_rod.getGenotypes();
for (int i = 0; i < hapmap_rod_genotypes.length; i++) {
String sample_id = sample_ids[i];
String hapmap_rod_genotype = hapmap_rod_genotypes[i];
// for each sample, set the genotype if it exists
VCFGenotypeCall genotype = new VCFGenotypeCall(ref_allele, loc);
if (!hapmap_rod_genotype.contains("N")) {
genotype.setGenotype(DiploidGenotype.createDiploidGenotype(hapmap_rod_genotype.charAt(0), hapmap_rod_genotype.charAt(1)));
genotype.setSampleName(sample_id);
genotype_calls.add(genotype);
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
VCFGenotypeEncoding allele1 = new VCFGenotypeEncoding(hapmap_rod_genotype.substring(0,1));
VCFGenotypeEncoding allele2 = new VCFGenotypeEncoding(hapmap_rod_genotype.substring(1));
alleles.add(allele1);
alleles.add(allele2);
VCFGenotypeRecord genotype = new VCFGenotypeRecord(sample_id, alleles, VCFGenotypeRecord.PHASE.UNPHASED);
record.addGenotypeRecord(genotype);
if ( !allele1.equals(refAllele) )
record.addAlternateBase(allele1);
if ( !allele2.equals(refAllele) )
record.addAlternateBase(allele2);
}
}
// add each genotype to VCF record and write it
VCFVariationCall call = new VCFVariationCall(ref_allele, loc, Variation.VARIANT_TYPE.SNP);
// Add dbsnp ID
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
if (dbsnp != null)
call.setID(dbsnp.getRS_ID());
vcfWriter.addMultiSampleCall(genotype_calls, call);
record.setID(dbsnp.getRS_ID());
// Write the VCF record
vcfWriter.addRecord(record);
}
}

View File

@ -56,10 +56,23 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
private final String mGenotypeFormatString;
// our genotype sample fields
private final List<Genotype> mGenotypeFields = new ArrayList<Genotype>();
private final List<Genotype> mGenotypeRecords = new ArrayList<Genotype>();
/**
* given a VCF header, and the values for each of the columns, create a VCF record.
* given a reference base, a location, and the format string, create a VCF record.
*
* @param referenceBase the reference base to use
* @param location our genomic location
* @param genotypeFormatString the format string
*/
public VCFRecord(char referenceBase, GenomeLoc location, String genotypeFormatString) {
setReferenceBase(referenceBase);
setLocation(location);
mGenotypeFormatString = genotypeFormatString;
}
/**
* given the values for each of the columns, create a VCF record.
*
* @param columnValues a mapping of header strings to values
* @param formatString the format string for the genotype records
@ -67,10 +80,20 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
*/
public VCFRecord(Map<VCFHeader.HEADER_FIELDS, String> columnValues, String formatString, List<VCFGenotypeRecord> genotypeRecords) {
extractFields(columnValues);
mGenotypeFields.addAll(genotypeRecords);
mGenotypeRecords.addAll(genotypeRecords);
mGenotypeFormatString = formatString;
}
/**
* given the values for each of the columns, create a VCF record.
*
* @param columnValues a mapping of header strings to values
*/
public VCFRecord(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
extractFields(columnValues);
mGenotypeFormatString = "";
}
/**
* create a VCF record
*
@ -95,8 +118,8 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
Map<String, String> infoFields,
String genotypeFormatString,
List<VCFGenotypeRecord> genotypeObjects) {
this.setReferenceBase(referenceBase);
this.setLocation(contig, position);
setReferenceBase(referenceBase);
setLocation(contig, position);
this.mID = ID;
for (VCFGenotypeEncoding alt : altBases)
this.addAlternateBase(alt);
@ -104,17 +127,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
this.setFilterString(filters);
this.mInfoFields.putAll(infoFields);
this.mGenotypeFormatString = genotypeFormatString;
this.mGenotypeFields.addAll(genotypeObjects);
}
/**
* given a VCF header, and the values for each of the columns, create a VCF record.
*
* @param columnValues a mapping of header strings to values
*/
public VCFRecord(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
extractFields(columnValues);
mGenotypeFormatString = "";
this.mGenotypeRecords.addAll(genotypeObjects);
}
/**
@ -135,12 +148,12 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
position = Integer.valueOf(columnValues.get(val));
break;
case ID:
this.setID(columnValues.get(val));
setID(columnValues.get(val));
break;
case REF:
if (columnValues.get(val).length() != 1)
throw new IllegalArgumentException("Reference base should be a single character");
this.setReferenceBase(columnValues.get(val).charAt(0));
setReferenceBase(columnValues.get(val).charAt(0));
break;
case ALT:
String values[] = columnValues.get(val).split(",");
@ -148,19 +161,19 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
addAlternateBase(new VCFGenotypeEncoding(alt));
break;
case QUAL:
this.setQual(Double.valueOf(columnValues.get(val)));
setQual(Double.valueOf(columnValues.get(val)));
break;
case FILTER:
this.setFilterString(columnValues.get(val));
setFilterString(columnValues.get(val));
break;
case INFO:
String vals[] = columnValues.get(val).split(";");
for (String alt : vals) {
String keyVal[] = alt.split("=");
if ( keyVal.length == 1 )
this.addInfoField(keyVal[0], "");
addInfoField(keyVal[0], "");
else if (keyVal.length == 2)
this.addInfoField(keyVal[0], keyVal[1]);
addInfoField(keyVal[0], keyVal[1]);
else
throw new IllegalArgumentException("info field key-value pair did not parse into key->value pair: " + alt);
}
@ -177,21 +190,21 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
*/
public boolean hasGenotypeData() {
return (mGenotypeFields.size() > 0);
return (mGenotypeRecords.size() > 0);
}
/**
* @return this VCF record's location
*/
public GenomeLoc getLocation() {
return this.mLoc;
return mLoc;
}
/**
* @return the ID value for this record
*/
public String getID() {
return this.mID;
return mID == null ? EMPTY_ID_FIELD : mID;
}
/**
@ -225,11 +238,11 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
}
public List<VCFGenotypeEncoding> getAlternateAlleles() {
return this.mAlts;
return mAlts;
}
public boolean hasAlternateAllele() {
for ( VCFGenotypeEncoding alt : this.mAlts ) {
for ( VCFGenotypeEncoding alt : mAlts ) {
if ( alt.getType() != VCFGenotypeEncoding.TYPE.UNCALLED )
return true;
}
@ -325,14 +338,14 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
* @return the phred-scaled quality score
*/
public double getQual() {
return this.mQual;
return mQual;
}
/**
* @return the -log10PError
*/
public double getNegLog10PError() {
return this.mQual / 10.0;
return mQual / 10.0;
}
/**
@ -364,46 +377,46 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
* @return a map, of the info key-value pairs
*/
public Map<String, String> getInfoValues() {
if (this.mInfoFields.size() < 1) {
if (mInfoFields.size() < 1) {
Map<String, String> map = new HashMap<String, String>();
map.put(".", "");
return map;
}
return this.mInfoFields;
return mInfoFields;
}
/**
* @return the number of columnsof data we're storing
*/
public int getColumnCount() {
if (this.hasGenotypeData()) return mGenotypeFields.size() + VCFHeader.HEADER_FIELDS.values().length;
if (hasGenotypeData()) return mGenotypeRecords.size() + VCFHeader.HEADER_FIELDS.values().length;
return VCFHeader.HEADER_FIELDS.values().length;
}
public List<VCFGenotypeRecord> getVCFGenotypeRecords() {
ArrayList<VCFGenotypeRecord> list = new ArrayList<VCFGenotypeRecord>();
for ( Genotype g : mGenotypeFields )
for ( Genotype g : mGenotypeRecords )
list.add((VCFGenotypeRecord)g);
return list;
}
public List<Genotype> getGenotypes() {
return mGenotypeFields;
return mGenotypeRecords;
}
public Genotype getCalledGenotype() {
if ( mGenotypeFields == null || mGenotypeFields.size() != 1 )
if ( mGenotypeRecords == null || mGenotypeRecords.size() != 1 )
throw new IllegalStateException("There is not one and only one genotype associated with this record");
VCFGenotypeRecord record = (VCFGenotypeRecord)mGenotypeFields.get(0);
VCFGenotypeRecord record = (VCFGenotypeRecord)mGenotypeRecords.get(0);
if ( record.isEmptyGenotype() )
return null;
return record;
}
public boolean hasGenotype(DiploidGenotype x) {
if ( mGenotypeFields == null )
if ( mGenotypeRecords == null )
return false;
for ( Genotype g : mGenotypeFields ) {
for ( Genotype g : mGenotypeRecords ) {
if ( DiploidGenotype.valueOf(g.getBases()).equals(x) )
return true;
}
@ -414,9 +427,9 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
* @return a List of the sample names
*/
public String[] getSampleNames() {
String names[] = new String[mGenotypeFields.size()];
for (int i = 0; i < mGenotypeFields.size(); i++) {
VCFGenotypeRecord rec = (VCFGenotypeRecord)mGenotypeFields.get(i);
String names[] = new String[mGenotypeRecords.size()];
for (int i = 0; i < mGenotypeRecords.size(); i++) {
VCFGenotypeRecord rec = (VCFGenotypeRecord)mGenotypeRecords.get(i);
names[i] = rec.getSampleName();
}
return names;
@ -441,7 +454,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
referenceBase = (char) ((referenceBase > 96) ? referenceBase - 32 : referenceBase);
if (referenceBase != 'A' && referenceBase != 'C' && referenceBase != 'T' && referenceBase != 'G' && referenceBase != 'N')
throw new IllegalArgumentException("Reference base must be one of A,C,G,T,N, we saw: " + referenceBase);
this.mReferenceBase = referenceBase;
mReferenceBase = referenceBase;
}
public void setLocation(String chrom, int position) {
@ -449,25 +462,29 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
throw new IllegalArgumentException("Chromosomes cannot be missing");
if ( position < 0 )
throw new IllegalArgumentException("Position values must be greater than 0");
this.mLoc = GenomeLocParser.createGenomeLoc(chrom, position);
mLoc = GenomeLocParser.createGenomeLoc(chrom, position);
}
public void setID(String mID) {
this.mID = mID;
public void setLocation(GenomeLoc location) {
mLoc = location.clone();
}
public void setQual(double mQual) {
if (mQual < 0)
public void setID(String ID) {
mID = ID;
}
public void setQual(double qual) {
if (qual < 0)
throw new IllegalArgumentException("Qual values must be greater than 0");
this.mQual = mQual;
mQual = qual;
}
public void setFilterString(String mFilterString) {
this.mFilterString = mFilterString;
public void setFilterString(String filterString) {
mFilterString = filterString;
}
public void addGenotypeField(VCFGenotypeRecord mGenotypeFields) {
this.mGenotypeFields.add(mGenotypeFields);
public void addGenotypeRecord(VCFGenotypeRecord mGenotypeRecord) {
mGenotypeRecords.add(mGenotypeRecord);
}
/**
@ -488,7 +505,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
*/
public void addInfoField(String key, String value) {
//System.out.printf("Adding info field %s=%s%n", key, value);
this.mInfoFields.put(key, value);
mInfoFields.put(key, value);
// remove the empty token if it's present
if ( mInfoFields.containsKey(".") )
@ -496,7 +513,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
}
public void printInfoFields() {
for ( Map.Entry<String, String> e : this.mInfoFields.entrySet() ) {
for ( Map.Entry<String, String> e : mInfoFields.entrySet() ) {
System.out.printf(" Current info field %s=%s this=%s%n", e.getKey(), e.getValue(), this);
}
}
@ -542,7 +559,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
builder.append(getReferenceBase());
builder.append(FIELD_SEPERATOR);
String alts = "";
for (VCFGenotypeEncoding str : this.getAlternateAlleles()) alts += str.toString() + ",";
for (VCFGenotypeEncoding str : getAlternateAlleles()) alts += str.toString() + ",";
builder.append((alts.length() > 0) ? alts.substring(0, alts.length() - 1) + FIELD_SEPERATOR : "." + FIELD_SEPERATOR);
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, getQual()));
builder.append(FIELD_SEPERATOR);
@ -550,7 +567,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
builder.append(FIELD_SEPERATOR);
builder.append(createInfoString());
if ( this.hasGenotypeData() ) {
if ( hasGenotypeData() ) {
try {
addGenotypeData(builder, header);
} catch (Exception e) {
@ -569,7 +586,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
*/
protected String createInfoString() {
String info = "";
for (String str : this.getInfoValues().keySet()) {
for (String str : getInfoValues().keySet()) {
if (str.equals(EMPTY_INFO_FIELD))
return EMPTY_INFO_FIELD;
else
@ -597,7 +614,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
tempStr.append(FIELD_SEPERATOR);
if ( gMap.containsKey(genotype) ) {
VCFGenotypeRecord rec = gMap.get(genotype);
tempStr.append(rec.toStringEncoding(this.mAlts, genotypeFormatStrings));
tempStr.append(rec.toStringEncoding(mAlts, genotypeFormatStrings));
gMap.remove(genotype);
} else {
tempStr.append(VCFGenotypeRecord.EMPTY_GENOTYPE);
@ -628,7 +645,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
if ( other.mFilterString != null ) return false;
} else if ( !this.mFilterString.equals(other.mFilterString) ) return false;
if (!this.mInfoFields.equals(other.mInfoFields)) return false;
if (!this.mGenotypeFields.equals(other.mGenotypeFields)) return false;
if (!this.mGenotypeRecords.equals(other.mGenotypeRecords)) return false;
return true;
}

View File

@ -21,7 +21,7 @@ public class HapMap2VCFIntegrationTest extends WalkerTest {
@Test
public void testHapMap2VCF() {
List<String> md5 = new ArrayList<String>();
md5.add("5f525309d7bfb28b639cdf5d3e22ee3c");
md5.add("a4de8775433c8228eeac2c91d4d3737a");
WalkerTestSpec spec = new WalkerTestSpec(
"-R " + seqLocation + "references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" +

View File

@ -92,7 +92,7 @@ public class VCFRecordTest extends BaseTest {
public void testGetGenotypes() {
Map<String, String> infoFields = new HashMap<String, String>();
VCFRecord rec = makeFakeVCFRecord(infoFields);
rec.addGenotypeField(createGenotype("sample2", "C", "A"));
rec.addGenotypeRecord(createGenotype("sample2", "C", "A"));
List<VCFGenotypeRecord> genotypeObjects = rec.getVCFGenotypeRecords();
Assert.assertEquals(2, genotypeObjects.size());
Assert.assertTrue(genotypeObjects.get(0).getSampleName().equals("sample1"));