From 040fdfee61ecbe01450f6f37e2634346597b076d Mon Sep 17 00:00:00 2001 From: ebanks Date: Mon, 11 Jan 2010 21:42:12 +0000 Subject: [PATCH] Cleaned up the interface to VCFRecord. It's now possible (and easy) to create records and then write them with a VCFWriter. I've updated HapMap2VCF to use the new interface; Chris agreed to take care of Sequenom2VCF. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2558 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/variantstovcf/HapMap2VCF.java | 46 +++--- .../sting/utils/genotype/vcf/VCFRecord.java | 135 ++++++++++-------- .../HapMap2VCFIntegrationTest.java | 2 +- .../utils/genotype/vcf/VCFRecordTest.java | 2 +- 4 files changed, 104 insertions(+), 81 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCF.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCF.java index b48d38c39..90bcdae92 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCF.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCF.java @@ -8,12 +8,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.genotype.vcf.*; -import org.broadinstitute.sting.utils.genotype.Variation; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; -import org.broadinstitute.sting.utils.genotype.Genotype; -import org.broadinstitute.sting.utils.genotype.VariationCall; import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.GenomeLoc; import java.util.*; import java.io.File; @@ -26,11 +21,11 @@ public class HapMap2VCF extends RodWalker { @Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true) protected File VCF_OUT; - private VCFGenotypeWriterAdapter vcfWriter; + private VCFWriter vcfWriter; String[] sample_ids = null; public void initialize() { - vcfWriter = new VCFGenotypeWriterAdapter(VCF_OUT); + vcfWriter = new VCFWriter(VCF_OUT); } /** @@ -43,7 +38,6 @@ public class HapMap2VCF extends RodWalker { for ( ReferenceOrderedDatum rod : tracker.getAllRods() ) { if ( rod instanceof HapMapGenotypeROD ) { - GenomeLoc loc = context.getLocation(); HapMapGenotypeROD hapmap_rod = (HapMapGenotypeROD) rod; // If this is the first time map is called, we need to fill out the sample_ids from the rod @@ -58,37 +52,49 @@ public class HapMap2VCF extends RodWalker { Set hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); hInfo.add(new VCFHeaderLine("source", "HapMap2VCF")); - hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName())); + hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); // write out the header once - vcfWriter.writeHeader(sample_id_set, hInfo); + vcfWriter.writeHeader(new VCFHeader(hInfo, sample_id_set)); } // Get reference base Character ref_allele = ref.getBase(); + VCFGenotypeEncoding refAllele = new VCFGenotypeEncoding(ref_allele.toString()); - // Print each sample's genotype info - List genotype_calls = new ArrayList(); + // Create a new record + VCFRecord record = new VCFRecord(ref_allele, context.getLocation(), "GT"); + + // Record each sample's genotype info String[] hapmap_rod_genotypes = hapmap_rod.getGenotypes(); for (int i = 0; i < hapmap_rod_genotypes.length; i++) { String sample_id = sample_ids[i]; String hapmap_rod_genotype = hapmap_rod_genotypes[i]; // for each sample, set the genotype if it exists - VCFGenotypeCall genotype = new VCFGenotypeCall(ref_allele, loc); if (!hapmap_rod_genotype.contains("N")) { - genotype.setGenotype(DiploidGenotype.createDiploidGenotype(hapmap_rod_genotype.charAt(0), hapmap_rod_genotype.charAt(1))); - genotype.setSampleName(sample_id); - genotype_calls.add(genotype); + List alleles = new ArrayList(); + VCFGenotypeEncoding allele1 = new VCFGenotypeEncoding(hapmap_rod_genotype.substring(0,1)); + VCFGenotypeEncoding allele2 = new VCFGenotypeEncoding(hapmap_rod_genotype.substring(1)); + alleles.add(allele1); + alleles.add(allele2); + + VCFGenotypeRecord genotype = new VCFGenotypeRecord(sample_id, alleles, VCFGenotypeRecord.PHASE.UNPHASED); + record.addGenotypeRecord(genotype); + if ( !allele1.equals(refAllele) ) + record.addAlternateBase(allele1); + if ( !allele2.equals(refAllele) ) + record.addAlternateBase(allele2); } } - // add each genotype to VCF record and write it - VCFVariationCall call = new VCFVariationCall(ref_allele, loc, Variation.VARIANT_TYPE.SNP); + // Add dbsnp ID rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)); if (dbsnp != null) - call.setID(dbsnp.getRS_ID()); - vcfWriter.addMultiSampleCall(genotype_calls, call); + record.setID(dbsnp.getRS_ID()); + + // Write the VCF record + vcfWriter.addRecord(record); } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index 10b2c3e89..fdbb45ed3 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -56,10 +56,23 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { private final String mGenotypeFormatString; // our genotype sample fields - private final List mGenotypeFields = new ArrayList(); + private final List mGenotypeRecords = new ArrayList(); /** - * given a VCF header, and the values for each of the columns, create a VCF record. + * given a reference base, a location, and the format string, create a VCF record. + * + * @param referenceBase the reference base to use + * @param location our genomic location + * @param genotypeFormatString the format string + */ + public VCFRecord(char referenceBase, GenomeLoc location, String genotypeFormatString) { + setReferenceBase(referenceBase); + setLocation(location); + mGenotypeFormatString = genotypeFormatString; + } + + /** + * given the values for each of the columns, create a VCF record. * * @param columnValues a mapping of header strings to values * @param formatString the format string for the genotype records @@ -67,10 +80,20 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { */ public VCFRecord(Map columnValues, String formatString, List genotypeRecords) { extractFields(columnValues); - mGenotypeFields.addAll(genotypeRecords); + mGenotypeRecords.addAll(genotypeRecords); mGenotypeFormatString = formatString; } + /** + * given the values for each of the columns, create a VCF record. + * + * @param columnValues a mapping of header strings to values + */ + public VCFRecord(Map columnValues) { + extractFields(columnValues); + mGenotypeFormatString = ""; + } + /** * create a VCF record * @@ -95,8 +118,8 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { Map infoFields, String genotypeFormatString, List genotypeObjects) { - this.setReferenceBase(referenceBase); - this.setLocation(contig, position); + setReferenceBase(referenceBase); + setLocation(contig, position); this.mID = ID; for (VCFGenotypeEncoding alt : altBases) this.addAlternateBase(alt); @@ -104,17 +127,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { this.setFilterString(filters); this.mInfoFields.putAll(infoFields); this.mGenotypeFormatString = genotypeFormatString; - this.mGenotypeFields.addAll(genotypeObjects); - } - - /** - * given a VCF header, and the values for each of the columns, create a VCF record. - * - * @param columnValues a mapping of header strings to values - */ - public VCFRecord(Map columnValues) { - extractFields(columnValues); - mGenotypeFormatString = ""; + this.mGenotypeRecords.addAll(genotypeObjects); } /** @@ -135,12 +148,12 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { position = Integer.valueOf(columnValues.get(val)); break; case ID: - this.setID(columnValues.get(val)); + setID(columnValues.get(val)); break; case REF: if (columnValues.get(val).length() != 1) throw new IllegalArgumentException("Reference base should be a single character"); - this.setReferenceBase(columnValues.get(val).charAt(0)); + setReferenceBase(columnValues.get(val).charAt(0)); break; case ALT: String values[] = columnValues.get(val).split(","); @@ -148,19 +161,19 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { addAlternateBase(new VCFGenotypeEncoding(alt)); break; case QUAL: - this.setQual(Double.valueOf(columnValues.get(val))); + setQual(Double.valueOf(columnValues.get(val))); break; case FILTER: - this.setFilterString(columnValues.get(val)); + setFilterString(columnValues.get(val)); break; case INFO: String vals[] = columnValues.get(val).split(";"); for (String alt : vals) { String keyVal[] = alt.split("="); if ( keyVal.length == 1 ) - this.addInfoField(keyVal[0], ""); + addInfoField(keyVal[0], ""); else if (keyVal.length == 2) - this.addInfoField(keyVal[0], keyVal[1]); + addInfoField(keyVal[0], keyVal[1]); else throw new IllegalArgumentException("info field key-value pair did not parse into key->value pair: " + alt); } @@ -177,21 +190,21 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { */ public boolean hasGenotypeData() { - return (mGenotypeFields.size() > 0); + return (mGenotypeRecords.size() > 0); } /** * @return this VCF record's location */ public GenomeLoc getLocation() { - return this.mLoc; + return mLoc; } /** * @return the ID value for this record */ public String getID() { - return this.mID; + return mID == null ? EMPTY_ID_FIELD : mID; } /** @@ -225,11 +238,11 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { } public List getAlternateAlleles() { - return this.mAlts; + return mAlts; } public boolean hasAlternateAllele() { - for ( VCFGenotypeEncoding alt : this.mAlts ) { + for ( VCFGenotypeEncoding alt : mAlts ) { if ( alt.getType() != VCFGenotypeEncoding.TYPE.UNCALLED ) return true; } @@ -325,14 +338,14 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { * @return the phred-scaled quality score */ public double getQual() { - return this.mQual; + return mQual; } /** * @return the -log10PError */ public double getNegLog10PError() { - return this.mQual / 10.0; + return mQual / 10.0; } /** @@ -364,46 +377,46 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { * @return a map, of the info key-value pairs */ public Map getInfoValues() { - if (this.mInfoFields.size() < 1) { + if (mInfoFields.size() < 1) { Map map = new HashMap(); map.put(".", ""); return map; } - return this.mInfoFields; + return mInfoFields; } /** * @return the number of columnsof data we're storing */ public int getColumnCount() { - if (this.hasGenotypeData()) return mGenotypeFields.size() + VCFHeader.HEADER_FIELDS.values().length; + if (hasGenotypeData()) return mGenotypeRecords.size() + VCFHeader.HEADER_FIELDS.values().length; return VCFHeader.HEADER_FIELDS.values().length; } public List getVCFGenotypeRecords() { ArrayList list = new ArrayList(); - for ( Genotype g : mGenotypeFields ) + for ( Genotype g : mGenotypeRecords ) list.add((VCFGenotypeRecord)g); return list; } public List getGenotypes() { - return mGenotypeFields; + return mGenotypeRecords; } public Genotype getCalledGenotype() { - if ( mGenotypeFields == null || mGenotypeFields.size() != 1 ) + if ( mGenotypeRecords == null || mGenotypeRecords.size() != 1 ) throw new IllegalStateException("There is not one and only one genotype associated with this record"); - VCFGenotypeRecord record = (VCFGenotypeRecord)mGenotypeFields.get(0); + VCFGenotypeRecord record = (VCFGenotypeRecord)mGenotypeRecords.get(0); if ( record.isEmptyGenotype() ) return null; return record; } public boolean hasGenotype(DiploidGenotype x) { - if ( mGenotypeFields == null ) + if ( mGenotypeRecords == null ) return false; - for ( Genotype g : mGenotypeFields ) { + for ( Genotype g : mGenotypeRecords ) { if ( DiploidGenotype.valueOf(g.getBases()).equals(x) ) return true; } @@ -414,9 +427,9 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { * @return a List of the sample names */ public String[] getSampleNames() { - String names[] = new String[mGenotypeFields.size()]; - for (int i = 0; i < mGenotypeFields.size(); i++) { - VCFGenotypeRecord rec = (VCFGenotypeRecord)mGenotypeFields.get(i); + String names[] = new String[mGenotypeRecords.size()]; + for (int i = 0; i < mGenotypeRecords.size(); i++) { + VCFGenotypeRecord rec = (VCFGenotypeRecord)mGenotypeRecords.get(i); names[i] = rec.getSampleName(); } return names; @@ -441,7 +454,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { referenceBase = (char) ((referenceBase > 96) ? referenceBase - 32 : referenceBase); if (referenceBase != 'A' && referenceBase != 'C' && referenceBase != 'T' && referenceBase != 'G' && referenceBase != 'N') throw new IllegalArgumentException("Reference base must be one of A,C,G,T,N, we saw: " + referenceBase); - this.mReferenceBase = referenceBase; + mReferenceBase = referenceBase; } public void setLocation(String chrom, int position) { @@ -449,25 +462,29 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { throw new IllegalArgumentException("Chromosomes cannot be missing"); if ( position < 0 ) throw new IllegalArgumentException("Position values must be greater than 0"); - this.mLoc = GenomeLocParser.createGenomeLoc(chrom, position); + mLoc = GenomeLocParser.createGenomeLoc(chrom, position); } - public void setID(String mID) { - this.mID = mID; + public void setLocation(GenomeLoc location) { + mLoc = location.clone(); } - public void setQual(double mQual) { - if (mQual < 0) + public void setID(String ID) { + mID = ID; + } + + public void setQual(double qual) { + if (qual < 0) throw new IllegalArgumentException("Qual values must be greater than 0"); - this.mQual = mQual; + mQual = qual; } - public void setFilterString(String mFilterString) { - this.mFilterString = mFilterString; + public void setFilterString(String filterString) { + mFilterString = filterString; } - public void addGenotypeField(VCFGenotypeRecord mGenotypeFields) { - this.mGenotypeFields.add(mGenotypeFields); + public void addGenotypeRecord(VCFGenotypeRecord mGenotypeRecord) { + mGenotypeRecords.add(mGenotypeRecord); } /** @@ -488,7 +505,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { */ public void addInfoField(String key, String value) { //System.out.printf("Adding info field %s=%s%n", key, value); - this.mInfoFields.put(key, value); + mInfoFields.put(key, value); // remove the empty token if it's present if ( mInfoFields.containsKey(".") ) @@ -496,7 +513,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { } public void printInfoFields() { - for ( Map.Entry e : this.mInfoFields.entrySet() ) { + for ( Map.Entry e : mInfoFields.entrySet() ) { System.out.printf(" Current info field %s=%s this=%s%n", e.getKey(), e.getValue(), this); } } @@ -542,7 +559,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { builder.append(getReferenceBase()); builder.append(FIELD_SEPERATOR); String alts = ""; - for (VCFGenotypeEncoding str : this.getAlternateAlleles()) alts += str.toString() + ","; + for (VCFGenotypeEncoding str : getAlternateAlleles()) alts += str.toString() + ","; builder.append((alts.length() > 0) ? alts.substring(0, alts.length() - 1) + FIELD_SEPERATOR : "." + FIELD_SEPERATOR); builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, getQual())); builder.append(FIELD_SEPERATOR); @@ -550,7 +567,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { builder.append(FIELD_SEPERATOR); builder.append(createInfoString()); - if ( this.hasGenotypeData() ) { + if ( hasGenotypeData() ) { try { addGenotypeData(builder, header); } catch (Exception e) { @@ -569,7 +586,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { */ protected String createInfoString() { String info = ""; - for (String str : this.getInfoValues().keySet()) { + for (String str : getInfoValues().keySet()) { if (str.equals(EMPTY_INFO_FIELD)) return EMPTY_INFO_FIELD; else @@ -597,7 +614,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { tempStr.append(FIELD_SEPERATOR); if ( gMap.containsKey(genotype) ) { VCFGenotypeRecord rec = gMap.get(genotype); - tempStr.append(rec.toStringEncoding(this.mAlts, genotypeFormatStrings)); + tempStr.append(rec.toStringEncoding(mAlts, genotypeFormatStrings)); gMap.remove(genotype); } else { tempStr.append(VCFGenotypeRecord.EMPTY_GENOTYPE); @@ -628,7 +645,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { if ( other.mFilterString != null ) return false; } else if ( !this.mFilterString.equals(other.mFilterString) ) return false; if (!this.mInfoFields.equals(other.mInfoFields)) return false; - if (!this.mGenotypeFields.equals(other.mGenotypeFields)) return false; + if (!this.mGenotypeRecords.equals(other.mGenotypeRecords)) return false; return true; } diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCFIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCFIntegrationTest.java index 52000762a..d9bbe28a8 100755 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCFIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCFIntegrationTest.java @@ -21,7 +21,7 @@ public class HapMap2VCFIntegrationTest extends WalkerTest { @Test public void testHapMap2VCF() { List md5 = new ArrayList(); - md5.add("5f525309d7bfb28b639cdf5d3e22ee3c"); + md5.add("a4de8775433c8228eeac2c91d4d3737a"); WalkerTestSpec spec = new WalkerTestSpec( "-R " + seqLocation + "references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" + diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java index 70a6599ad..ddeb0503b 100755 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFRecordTest.java @@ -92,7 +92,7 @@ public class VCFRecordTest extends BaseTest { public void testGetGenotypes() { Map infoFields = new HashMap(); VCFRecord rec = makeFakeVCFRecord(infoFields); - rec.addGenotypeField(createGenotype("sample2", "C", "A")); + rec.addGenotypeRecord(createGenotype("sample2", "C", "A")); List genotypeObjects = rec.getVCFGenotypeRecords(); Assert.assertEquals(2, genotypeObjects.size()); Assert.assertTrue(genotypeObjects.get(0).getSampleName().equals("sample1"));