diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/HapMapGenotypeROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/HapMapGenotypeROD.java index 222b0aac8..7ba9fdb64 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/HapMapGenotypeROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/HapMapGenotypeROD.java @@ -7,39 +7,29 @@ import org.broadinstitute.sting.utils.GenomeLocParser; public class HapMapGenotypeROD extends TabularROD { - public HapMapGenotypeROD(final String name) - { + public HapMapGenotypeROD(final String name) { super(name); } - public GenomeLoc getLocation() - { - //System.out.printf("chrom: %s; pos: %s\n", this.get("chrom"), this.get("pos")); - + public GenomeLoc getLocation() { // For converting from Hg18 to b36 format: // return GenomeLocParser.createGenomeLoc(this.get("chrom").replaceAll("chr", ""), Long.parseLong(this.get("pos"))); return GenomeLocParser.createGenomeLoc(this.get("chrom"), Long.parseLong(this.get("pos"))); } - public String[] getSampleIDs() - { - ArrayList header = this.getHeader(); + public String[] getSampleIDs() { + ArrayList header = getHeader(); String[] sample_ids = new String[header.size()-11]; for (int i = 11; i < header.size(); i++) - { sample_ids[i-11] = header.get(i); - } return sample_ids; } - public String[] getGenotypes() - { - ArrayList header = this.getHeader(); + public String[] getGenotypes() { + ArrayList header = getHeader(); String[] genotypes = new String[header.size()-11]; for (int i = 11; i < header.size(); i++) - { - genotypes[i-11] = this.get(header.get(i)); - } + genotypes[i-11] = get(header.get(i)); return genotypes; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 19617d334..ded30166d 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.refdata; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; @@ -219,17 +220,17 @@ public class RefMetaDataTracker { * * The name of each VariantContext corresponds to the ROD name. * - * @param curLocation - * @param allowedTypes - * @param requireStartHere - * @param takeFirstOnly - * @return + * @param curLocation location + * @param allowedTypes allowed types + * @param requireStartHere do we require the rod to start at this location? + * @param takeFirstOnly do we take the first rod only? + * @return variant context */ public Collection getAllVariantContexts(EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { List contexts = new ArrayList(); for ( RODRecordList rodList : getBoundRodTracks() ) { - addVariantContexts(contexts, rodList, allowedTypes, curLocation, requireStartHere, takeFirstOnly); + addVariantContexts(contexts, rodList, allowedTypes, curLocation, null, requireStartHere, takeFirstOnly); } return contexts; @@ -240,25 +241,33 @@ public class RefMetaDataTracker { * * see getVariantContexts for more information. * - * @param name - * @param curLocation - * @param allowedTypes - * @param requireStartHere - * @param takeFirstOnly - * @return + * @param name name + * @param curLocation location + * @param allowedTypes allowed types + * @param requireStartHere do we require the rod to start at this location? + * @param takeFirstOnly do we take the first rod only? + * @return variant context */ public Collection getVariantContexts(String name, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { - return getVariantContexts(Arrays.asList(name), allowedTypes, curLocation, requireStartHere, takeFirstOnly); + return getVariantContexts(Arrays.asList(name), allowedTypes, curLocation, null, requireStartHere, takeFirstOnly); + } + + public Collection getVariantContexts(String name, EnumSet allowedTypes, GenomeLoc curLocation, Allele ref, boolean requireStartHere, boolean takeFirstOnly ) { + return getVariantContexts(Arrays.asList(name), allowedTypes, curLocation, ref, requireStartHere, takeFirstOnly); } public Collection getVariantContexts(Collection names, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { + return getVariantContexts(names, allowedTypes, curLocation, null, requireStartHere, takeFirstOnly); + } + + public Collection getVariantContexts(Collection names, EnumSet allowedTypes, GenomeLoc curLocation, Allele ref, boolean requireStartHere, boolean takeFirstOnly ) { Collection contexts = new ArrayList(); for ( String name : names ) { RODRecordList rodList = getTrackData(name, null); if ( rodList != null ) - addVariantContexts(contexts, rodList, allowedTypes, curLocation, requireStartHere, takeFirstOnly ); + addVariantContexts(contexts, rodList, allowedTypes, curLocation, ref, requireStartHere, takeFirstOnly ); } return contexts; @@ -268,11 +277,11 @@ public class RefMetaDataTracker { * Gets the variant context associated with name, and assumes the system only has a single bound track at this location. Throws an exception if not. * see getVariantContexts for more information. * - * @param name - * @param curLocation - * @param allowedTypes - * @param requireStartHere - * @return + * @param name name + * @param curLocation location + * @param allowedTypes allowed types + * @param requireStartHere do we require the rod to start at this location? + * @return variant context */ public VariantContext getVariantContext(String name, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere ) { Collection contexts = getVariantContexts(name, allowedTypes, curLocation, requireStartHere, false ); @@ -285,11 +294,15 @@ public class RefMetaDataTracker { return contexts.iterator().next(); } - private void addVariantContexts(Collection contexts, RODRecordList rodList, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { + private void addVariantContexts(Collection contexts, RODRecordList rodList, EnumSet allowedTypes, GenomeLoc curLocation, Allele ref, boolean requireStartHere, boolean takeFirstOnly ) { for ( ReferenceOrderedDatum rec : rodList ) { if ( VariantContextAdaptors.canBeConvertedToVariantContext(rec) ) { // ok, we might actually be able to turn this record in a variant context - VariantContext vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec); + VariantContext vc; + if ( ref == null ) + vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec); + else + vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec, ref); if ( vc == null ) // sometimes the track has odd stuff in it that can't be converted continue; diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 6e98193d3..039ac1e11 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -43,6 +43,7 @@ public class VariantContextAdaptors { adaptors.put(RodVCF.class, new RodVCFAdaptor()); adaptors.put(VCFRecord.class, new VCFRecordAdaptor()); adaptors.put(PlinkRod.class, new PlinkRodAdaptor()); + adaptors.put(HapMapGenotypeROD.class, new HapMapAdaptor()); adaptors.put(RodGLF.class, new GLFAdaptor()); adaptors.put(RodGeliText.class, new GeliAdaptor()); } @@ -568,4 +569,67 @@ public class VariantContextAdaptors { return null; // can't handle anything else } } + + // -------------------------------------------------------------------------------------------------------------- + // + // HapMap to VariantContext + // + // -------------------------------------------------------------------------------------------------------------- + + private static class HapMapAdaptor extends VCAdaptor { + /** + * convert to a Variant Context, given: + * @param name the name of the ROD + * @param input the Rod object, in this case a RodGeliText + * @return a VariantContext object + */ + VariantContext convert(String name, Object input) { + throw new UnsupportedOperationException("Conversion from HapMap to VariantContext requires knowledge of the reference allele"); + } + + /** + * convert to a Variant Context, given: + * @param name the name of the ROD + * @param input the Rod object, in this case a RodGeliText + * @param refAllele the reference base as an Allele object + * @return a VariantContext object + */ + VariantContext convert(String name, Object input, Allele refAllele) { + HapMapGenotypeROD hapmap = (HapMapGenotypeROD)input; + + // add the reference allele + HashSet alleles = new HashSet(); + alleles.add(refAllele); + + // make a mapping from sample to genotype + String[] samples = hapmap.getSampleIDs(); + String[] genotypeStrings = hapmap.getGenotypes(); + + Map genotypes = new HashMap(samples.length); + for ( int i = 0; i < samples.length; i++ ) { + // ignore bad genotypes + if ( genotypeStrings[i].contains("N") ) + continue; + + String a1 = genotypeStrings[i].substring(0,1); + String a2 = genotypeStrings[i].substring(1); + + Allele allele1 = new Allele(a1, refAllele.basesMatch(a1)); + Allele allele2 = new Allele(a2, refAllele.basesMatch(a2)); + + ArrayList myAlleles = new ArrayList(2); + myAlleles.add(allele1); + myAlleles.add(allele2); + alleles.add(allele1); + alleles.add(allele2); + + Genotype g = new Genotype(samples[i], myAlleles); + genotypes.put(samples[i], g); + } + + VariantContext vc = new VariantContext(name, hapmap.getLocation(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, new HashMap()); + vc.validate(); + return vc; + } + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCF.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCF.java deleted file mode 100755 index 90753ca7a..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCF.java +++ /dev/null @@ -1,126 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.HapMapGenotypeROD; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.genotype.vcf.*; -import org.broadinstitute.sting.utils.cmdLine.Argument; - -import java.util.*; -import java.io.File; - -/** - * Converts HapMap genotype information to VCF format - */ -public class HapMap2VCF extends RodWalker { - - @Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true) - protected File VCF_OUT; - - private VCFWriter vcfWriter; - String[] sample_ids = null; - - public void initialize() { - vcfWriter = new VCFWriter(VCF_OUT); - } - - /** - * For each HapMap record, generate and print the VCF record string - * Output the VCF header if this is the first record - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - for ( ReferenceOrderedDatum rod : tracker.getAllRods() ) { - if ( rod instanceof HapMapGenotypeROD ) { - HapMapGenotypeROD hapmap_rod = (HapMapGenotypeROD) rod; - - // If this is the first time map is called, we need to fill out the sample_ids from the rod - if (sample_ids == null) { - // ensure that there are no duplicate sample IDs - sample_ids = hapmap_rod.getSampleIDs(); - Set sample_id_set = new LinkedHashSet(Arrays.asList(sample_ids)); - if (sample_id_set.size() != sample_ids.length) - throw new IllegalStateException("Sample set passed into HapMap2VCF has repeated sample IDs"); - - // setup the header fields - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("source", "HapMap2VCF")); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - // write out the header once - vcfWriter.writeHeader(new VCFHeader(hInfo, sample_id_set)); - } - - // Get reference base - Character ref_allele = ref.getBase(); - VCFGenotypeEncoding refAllele = new VCFGenotypeEncoding(ref_allele.toString()); - - // Create a new record - VCFRecord record = new VCFRecord(Character.toString(ref_allele), context.getLocation(), "GT"); - - // Record each sample's genotype info - String[] hapmap_rod_genotypes = hapmap_rod.getGenotypes(); - for (int i = 0; i < hapmap_rod_genotypes.length; i++) { - String sample_id = sample_ids[i]; - String hapmap_rod_genotype = hapmap_rod_genotypes[i]; - - // for each sample, set the genotype if it exists - if (!hapmap_rod_genotype.contains("N")) { - List alleles = new ArrayList(); - VCFGenotypeEncoding allele1 = new VCFGenotypeEncoding(hapmap_rod_genotype.substring(0,1)); - VCFGenotypeEncoding allele2 = new VCFGenotypeEncoding(hapmap_rod_genotype.substring(1)); - alleles.add(allele1); - alleles.add(allele2); - - VCFGenotypeRecord genotype = new VCFGenotypeRecord(sample_id, alleles, VCFGenotypeRecord.PHASE.UNPHASED); - record.addGenotypeRecord(genotype); - if ( !allele1.equals(refAllele) ) - record.addAlternateBase(allele1); - if ( !allele2.equals(refAllele) ) - record.addAlternateBase(allele2); - } - } - - // Add dbsnp ID - rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)); - if (dbsnp != null) - record.setID(dbsnp.getRS_ID()); - - // Write the VCF record - vcfWriter.addRecord(record); - } - } - - return 1; - } - - /** - * Initialize the number of loci processed to zero. - * - * @return 0 - */ - public Integer reduceInit() { - return 0; - } - - /** - * Increment the number of rods processed. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return the new number of rods processed. - */ - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer value) {} - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java index 1052cc017..cabbbb9ed 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java @@ -3,7 +3,9 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RMD; @@ -15,9 +17,11 @@ import java.util.*; /** * Converts variants from other file formats to VCF format. */ -@Requires(value={},referenceMetaData=@RMD(name="variant",type= ReferenceOrderedDatum.class)) +@Requires(value={},referenceMetaData=@RMD(name=VariantsToVCF.INPUT_ROD_NAME,type= ReferenceOrderedDatum.class)) public class VariantsToVCF extends RodWalker { + public static final String INPUT_ROD_NAME = "variant"; + @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false) protected String sampleName = null; @@ -34,37 +38,50 @@ public class VariantsToVCF extends RodWalker { rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)); - Collection contexts = tracker.getVariantContexts("variant", ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), true, false); + Allele refAllele = new Allele(Character.toString(ref.getBase()), true); + Collection contexts = tracker.getVariantContexts(INPUT_ROD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), refAllele, true, false); + for ( VariantContext vc : contexts ) { VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), Arrays.asList(ALLOWED_FORMAT_FIELDS), false, false); if ( dbsnp != null ) vcf.setID(dbsnp.getRS_ID()); - if ( sampleName != null && vcf.hasGenotypeData() && vcf.getGenotype("variant") != null ) - vcf.getGenotype("variant").setSampleName(sampleName); - writeRecord(vcf); + // set the appropriate sample name if necessary + if ( sampleName != null && vcf.hasGenotypeData() && vcf.getGenotype(INPUT_ROD_NAME) != null ) + vcf.getGenotype(INPUT_ROD_NAME).setSampleName(sampleName); + writeRecord(vcf, tracker); } return 1; } - private void writeRecord(VCFRecord rec) { + private void writeRecord(VCFRecord rec, RefMetaDataTracker tracker) { if ( vcfwriter == null ) { // setup the header fields Set hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("source", "VariantFiltration")); + hInfo.add(new VCFHeaderLine("source", "VariantsToVCF")); hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - vcfwriter = new VCFWriter(out); - TreeSet samples = new TreeSet(); - if ( sampleName != null ) + if ( sampleName != null ) { samples.add(sampleName); - else - samples.addAll(Arrays.asList(rec.getSampleNames())); + } else { + RODRecordList rods = tracker.getTrackData(INPUT_ROD_NAME, null); + if ( rods.size() == 0 ) + throw new IllegalStateException("VCF record was created, but no rod data is present"); + + ReferenceOrderedDatum rod = rods.get(0); + if ( rod instanceof RodVCF ) + samples.addAll(Arrays.asList(((RodVCF)rod).getSampleNames())); + else if ( rod instanceof HapMapGenotypeROD ) + samples.addAll(Arrays.asList(((HapMapGenotypeROD)rod).getSampleIDs())); + else + samples.addAll(Arrays.asList(rec.getSampleNames())); + } + + vcfwriter = new VCFWriter(out); vcfwriter.writeHeader(new VCFHeader(hInfo, samples)); - } vcfwriter.addRecord(rec); } @@ -78,6 +95,7 @@ public class VariantsToVCF extends RodWalker { } public void onTraversalDone(Integer sum) { - vcfwriter.close(); + if ( vcfwriter != null ) + vcfwriter.close(); } } diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCFIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCFIntegrationTest.java deleted file mode 100755 index d9bbe28a8..000000000 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/HapMap2VCFIntegrationTest.java +++ /dev/null @@ -1,37 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf; - -import org.broadinstitute.sting.WalkerTest; -import org.junit.Test; - -import java.util.List; -import java.util.ArrayList; -import java.io.File; - - -/** - * @author aaron - *

- * Class VariantsToVCFIntegrationTest - *

- * test(s) for the VariantsToVCF walker. - */ -public class HapMap2VCFIntegrationTest extends WalkerTest { - - - @Test - public void testHapMap2VCF() { - List md5 = new ArrayList(); - md5.add("a4de8775433c8228eeac2c91d4d3737a"); - - WalkerTestSpec spec = new WalkerTestSpec( - "-R " + seqLocation + "references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" + - " --rodBind HapMapChip,HapMapGenotype," + validationDataLocation + "hapmap_phase_ii+iii_genotypes_chrX_YRI_r27_nr.hapmap" + - " -T HapMap2VCF" + - " -L chrX:1-1,000,000" + - " --vcfOutput %s", - 1, // just one output file - md5); - List result = executeTest("testHapMap2VCFUsingGeliInput", spec).getFirst(); - } - -} diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCFIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCFIntegrationTest.java index 0516eedca..f12c27870 100755 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCFIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCFIntegrationTest.java @@ -20,7 +20,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testVariantsToVCFUsingGeliInput() { List md5 = new ArrayList(); - md5.add("8f32f0efed5d0233cf9292198f4f01d8"); + md5.add("211be63cf93cddf021e5b0eb9341b386"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + oneKGLocation + "reference/human_b36_both.fasta" + @@ -37,7 +37,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingGeliInput() { List md5 = new ArrayList(); - md5.add("40cc4d04d9a50043ce1322ea2650c453"); + md5.add("b31446fa91b8ed82ad73a5a5a72700a7"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + oneKGLocation + "reference/human_b36_both.fasta" + @@ -51,4 +51,19 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { executeTest("testVariantsToVCFUsingGeliInput #2", spec).getFirst(); } + @Test + public void testGenotypesToVCFUsingHapMapInput() { + List md5 = new ArrayList(); + md5.add("03ff126faf5751a83bd7ab9e020bce7e"); + + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + oneKGLocation + "reference/human_b36_both.fasta" + + " -B variant,HapMapGenotype," + validationDataLocation + "rawHapMap.yri.chr1.txt" + + " -T VariantsToVCF" + + " -L 1:1-1,000,000" + + " -o %s", + 1, // just one output file + md5); + executeTest("testVariantsToVCFUsingHapMapInput", spec).getFirst(); + } }