diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCF.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCF.java index 016568172..79863e020 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCF.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCF.java @@ -6,17 +6,31 @@ import org.broadinstitute.sting.gatk.refdata.HapMapGenotypeROD; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.genotype.vcf.*; +import org.broadinstitute.sting.utils.genotype.Variation; +import org.broadinstitute.sting.utils.genotype.DiploidGenotype; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.VariationCall; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.GenomeLoc; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Set; +import java.util.*; +import java.io.File; /** * Converts HapMap genotype information to VCF format */ public class HapMap2VCF extends RodWalker { - String[] sample_ids; + @Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true) + protected File VCF_OUT; + + private VCFGenotypeWriterAdapter vcfWriter; + String[] sample_ids = null; + + public void initialize() { + vcfWriter = new VCFGenotypeWriterAdapter(VCF_OUT); + } /** * For each HapMap record, generate and print the VCF record string @@ -26,112 +40,64 @@ public class HapMap2VCF extends RodWalker { if ( tracker == null ) return 0; - Iterator rods = tracker.getAllRods().iterator(); - boolean first_hapmap_rod = true; - while ( rods.hasNext() ) { - ReferenceOrderedDatum rod = rods.next(); + for ( ReferenceOrderedDatum rod : tracker.getAllRods() ) { if ( rod instanceof HapMapGenotypeROD ) { HapMapGenotypeROD hapmap_rod = (HapMapGenotypeROD) rod; - if (first_hapmap_rod) { - out.println(VCFHeaderString(hapmap_rod.getSampleIDs())); - first_hapmap_rod = false; + + // If this is the first time map is called, we need to fill out the sample_ids from the rod + if (sample_ids == null) { + // ensure that there are no duplicate sample IDs + sample_ids = hapmap_rod.getSampleIDs(); + Set sample_id_set = new LinkedHashSet(Arrays.asList(sample_ids)); + if (sample_id_set.size() != sample_ids.length) + throw new IllegalStateException("Sample set passed into HapMap2VCF has repeated sample IDs"); + + // setup the header fields + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFHeaderLine("source", "HapMap2VCF")); + hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName())); + + // write out the header once + vcfWriter.writeHeader(sample_id_set, hInfo); } - // Get reference and alternate bases - - Character alt_allele, ref_allele; - - Set observed_alleles = getObservedAlleles (hapmap_rod.getGenotypes()); - if (observed_alleles.contains('N')) - observed_alleles.remove('N'); - ref_allele = ref.getBase(); - if (observed_alleles.contains(ref_allele)) - observed_alleles.remove(ref_allele); - if (observed_alleles.isEmpty()) { - alt_allele = ref_allele; // ## todo: Confirm that alt allele becomes ref allele - }else{ - if (observed_alleles.size() != 1) { - out.println("Error: more than 2 alleles found in hapmap chip position"); - alt_allele = 'N'; - System.exit(1); - }else{ - alt_allele = observed_alleles.iterator().next(); - } - } - - // Print all position specific info - String vcf = hapmap_rod.get("chrom")+"\t"+ - hapmap_rod.get("pos")+"\t"+ - hapmap_rod.get("rs#")+"\t"+ - ref_allele+"\t"+ - alt_allele+"\t"+ - "99\t0\t.\tGT:GQ"; - out.print(vcf); + // Get reference base and locus + Character ref_allele = ref.getBase(); + GenomeLoc loc = hapmap_rod.getLocation(); + VCFVariationCall variation = new VCFVariationCall(ref_allele, loc, Variation.VARIANT_TYPE.SNP); // Print each sample's genotype info + List genotype_calls = new ArrayList(); + String[] hapmap_rod_genotypes = hapmap_rod.getGenotypes(); + for (int i = 0; i < hapmap_rod_genotypes.length; i++) { + String sample_id = sample_ids[i]; + String hapmap_rod_genotype = hapmap_rod_genotypes[i]; - String allele_strings = ""; - for (String genotype : hapmap_rod.getGenotypes()) { - String allele_str = ""; // one allele string - Integer GQ = 99; - for (Character allele : genotype.toCharArray()) { - if (allele == ref_allele) { - allele_str += "0"; - }else{ - if (allele == alt_allele) { - allele_str += "1"; - }else{ - if (allele == 'N') { - allele_str += "0"; - GQ = 0; - }else{ - out.println("ERROR: Unexpected tri-allelic site detected"); - System.exit(1); - } - } - } - if (allele_str.length() == 1) { - allele_str += "/"; - } + // for each sample, set the genotype if it exists + VCFGenotypeCall genotype = new VCFGenotypeCall(ref_allele, loc); + if (!hapmap_rod_genotype.contains("N")) { + genotype.setGenotype(DiploidGenotype.createDiploidGenotype(hapmap_rod_genotype.charAt(0), hapmap_rod_genotype.charAt(1))); + genotype.setSampleName(sample_id); + genotype_calls.add(genotype); } - allele_strings += "\t" + allele_str+":"+GQ; } - out.println(allele_strings); + // add each genotype to VCF record and write it + variation.setGenotypeCalls(genotype_calls); + vcfWriter.addMultiSampleCall(genotype_calls, new VCFVariationCall(ref_allele, loc, Variation.VARIANT_TYPE.SNP)); } } return 1; } - public static Set getObservedAlleles (String[] genotypes) { - Set observed_alleles = new HashSet(); - for (String genotype : genotypes) - for (Character allele : genotype.toCharArray()) - if (!observed_alleles.contains(allele)) - observed_alleles.add(allele); - - return observed_alleles; - } - - public String VCFHeaderString (String[] sample_ids) { - String header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; - for (String sample_id : sample_ids) - header += "\t" + sample_id; - - return header; - } - /** * Initialize the number of loci processed to zero. * * @return 0 */ public Integer reduceInit() { - out.println("#fileformat=VCFv3.3"); - out.println("##reference=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"); - out.println("##source=VariantsToVCF"); - return 0; } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFParameters.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFParameters.java index 92da3b308..eca90ff84 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFParameters.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFParameters.java @@ -66,7 +66,7 @@ class VCFParameters { public void addAlternateBase(VCFGenotypeEncoding base) { if ( !alternateBases.contains(base) && - !base.toString().equals(String.valueOf(getReferenceBase())) && + !base.toString().equals(String.valueOf(getReferenceBase()).toUpperCase()) && !base.toString().equals(VCFGenotypeRecord.EMPTY_ALLELE) ) alternateBases.add(base); } diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCFIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCFIntegrationTest.java new file mode 100755 index 000000000..ffcb00fef --- /dev/null +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCFIntegrationTest.java @@ -0,0 +1,37 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf; + +import org.broadinstitute.sting.WalkerTest; +import org.junit.Test; + +import java.util.List; +import java.util.ArrayList; +import java.io.File; + + +/** + * @author aaron + *

+ * Class VariantsToVCFIntegrationTest + *

+ * test(s) for the VariantsToVCF walker. + */ +public class HapMap2VCFIntegrationTest extends WalkerTest { + + + @Test + public void testHapMap2VCF() { + List md5 = new ArrayList(); + md5.add("4d36df142bbd3d446baec6213771800a"); + + WalkerTestSpec spec = new WalkerTestSpec( + "-R " + seqLocation + "references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" + + " --rodBind HapMapChip,HapMapGenotype," + validationDataLocation + "hapmap_phase_ii+iii_genotypes_chrX_YRI_r27_nr.hapmap" + + " -T HapMap2VCF" + + " -L chrX:1-1,000,000" + + " --vcfOutput %s", + 1, // just one output file + md5); + List result = executeTest("testHapMap2VCFUsingGeliInput", spec).getFirst(); + } + +} \ No newline at end of file