HapMap-to-VCF now works fine within Variants-to-VCF. Added integration test for it and removed old code.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3077 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-03-26 18:34:59 +00:00
parent 78af6d5a40
commit 14bf6923a8
7 changed files with 154 additions and 217 deletions

View File

@ -7,39 +7,29 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
public class HapMapGenotypeROD extends TabularROD
{
public HapMapGenotypeROD(final String name)
{
public HapMapGenotypeROD(final String name) {
super(name);
}
public GenomeLoc getLocation()
{
//System.out.printf("chrom: %s; pos: %s\n", this.get("chrom"), this.get("pos"));
public GenomeLoc getLocation() {
// For converting from Hg18 to b36 format:
// return GenomeLocParser.createGenomeLoc(this.get("chrom").replaceAll("chr", ""), Long.parseLong(this.get("pos")));
return GenomeLocParser.createGenomeLoc(this.get("chrom"), Long.parseLong(this.get("pos")));
}
public String[] getSampleIDs()
{
ArrayList<String> header = this.getHeader();
public String[] getSampleIDs() {
ArrayList<String> header = getHeader();
String[] sample_ids = new String[header.size()-11];
for (int i = 11; i < header.size(); i++)
{
sample_ids[i-11] = header.get(i);
}
return sample_ids;
}
public String[] getGenotypes()
{
ArrayList<String> header = this.getHeader();
public String[] getGenotypes() {
ArrayList<String> header = getHeader();
String[] genotypes = new String[header.size()-11];
for (int i = 11; i < header.size(); i++)
{
genotypes[i-11] = this.get(header.get(i));
}
genotypes[i-11] = get(header.get(i));
return genotypes;
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.refdata;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
@ -219,17 +220,17 @@ public class RefMetaDataTracker {
*
* The name of each VariantContext corresponds to the ROD name.
*
* @param curLocation
* @param allowedTypes
* @param requireStartHere
* @param takeFirstOnly
* @return
* @param curLocation location
* @param allowedTypes allowed types
* @param requireStartHere do we require the rod to start at this location?
* @param takeFirstOnly do we take the first rod only?
* @return variant context
*/
public Collection<VariantContext> getAllVariantContexts(EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
List<VariantContext> contexts = new ArrayList<VariantContext>();
for ( RODRecordList rodList : getBoundRodTracks() ) {
addVariantContexts(contexts, rodList, allowedTypes, curLocation, requireStartHere, takeFirstOnly);
addVariantContexts(contexts, rodList, allowedTypes, curLocation, null, requireStartHere, takeFirstOnly);
}
return contexts;
@ -240,25 +241,33 @@ public class RefMetaDataTracker {
*
* see getVariantContexts for more information.
*
* @param name
* @param curLocation
* @param allowedTypes
* @param requireStartHere
* @param takeFirstOnly
* @return
* @param name name
* @param curLocation location
* @param allowedTypes allowed types
* @param requireStartHere do we require the rod to start at this location?
* @param takeFirstOnly do we take the first rod only?
* @return variant context
*/
public Collection<VariantContext> getVariantContexts(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
return getVariantContexts(Arrays.asList(name), allowedTypes, curLocation, requireStartHere, takeFirstOnly);
return getVariantContexts(Arrays.asList(name), allowedTypes, curLocation, null, requireStartHere, takeFirstOnly);
}
public Collection<VariantContext> getVariantContexts(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, Allele ref, boolean requireStartHere, boolean takeFirstOnly ) {
return getVariantContexts(Arrays.asList(name), allowedTypes, curLocation, ref, requireStartHere, takeFirstOnly);
}
public Collection<VariantContext> getVariantContexts(Collection<String> names, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
return getVariantContexts(names, allowedTypes, curLocation, null, requireStartHere, takeFirstOnly);
}
public Collection<VariantContext> getVariantContexts(Collection<String> names, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, Allele ref, boolean requireStartHere, boolean takeFirstOnly ) {
Collection<VariantContext> contexts = new ArrayList<VariantContext>();
for ( String name : names ) {
RODRecordList rodList = getTrackData(name, null);
if ( rodList != null )
addVariantContexts(contexts, rodList, allowedTypes, curLocation, requireStartHere, takeFirstOnly );
addVariantContexts(contexts, rodList, allowedTypes, curLocation, ref, requireStartHere, takeFirstOnly );
}
return contexts;
@ -268,11 +277,11 @@ public class RefMetaDataTracker {
* Gets the variant context associated with name, and assumes the system only has a single bound track at this location. Throws an exception if not.
* see getVariantContexts for more information.
*
* @param name
* @param curLocation
* @param allowedTypes
* @param requireStartHere
* @return
* @param name name
* @param curLocation location
* @param allowedTypes allowed types
* @param requireStartHere do we require the rod to start at this location?
* @return variant context
*/
public VariantContext getVariantContext(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere ) {
Collection<VariantContext> contexts = getVariantContexts(name, allowedTypes, curLocation, requireStartHere, false );
@ -285,11 +294,15 @@ public class RefMetaDataTracker {
return contexts.iterator().next();
}
private void addVariantContexts(Collection<VariantContext> contexts, RODRecordList rodList, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
private void addVariantContexts(Collection<VariantContext> contexts, RODRecordList rodList, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, Allele ref, boolean requireStartHere, boolean takeFirstOnly ) {
for ( ReferenceOrderedDatum rec : rodList ) {
if ( VariantContextAdaptors.canBeConvertedToVariantContext(rec) ) {
// ok, we might actually be able to turn this record in a variant context
VariantContext vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec);
VariantContext vc;
if ( ref == null )
vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec);
else
vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec, ref);
if ( vc == null ) // sometimes the track has odd stuff in it that can't be converted
continue;

View File

@ -43,6 +43,7 @@ public class VariantContextAdaptors {
adaptors.put(RodVCF.class, new RodVCFAdaptor());
adaptors.put(VCFRecord.class, new VCFRecordAdaptor());
adaptors.put(PlinkRod.class, new PlinkRodAdaptor());
adaptors.put(HapMapGenotypeROD.class, new HapMapAdaptor());
adaptors.put(RodGLF.class, new GLFAdaptor());
adaptors.put(RodGeliText.class, new GeliAdaptor());
}
@ -568,4 +569,67 @@ public class VariantContextAdaptors {
return null; // can't handle anything else
}
}
// --------------------------------------------------------------------------------------------------------------
//
// HapMap to VariantContext
//
// --------------------------------------------------------------------------------------------------------------
private static class HapMapAdaptor extends VCAdaptor {
/**
 * Unsupported single-argument conversion: a HapMap record cannot be turned into a
 * VariantContext without knowing the reference allele at its position.
 *
 * @param name the name of the ROD
 * @param input the Rod object, in this case a HapMapGenotypeROD
 * @return never returns normally
 * @throws UnsupportedOperationException always; use convert(name, input, refAllele) instead
 */
VariantContext convert(String name, Object input) {
throw new UnsupportedOperationException("Conversion from HapMap to VariantContext requires knowledge of the reference allele");
}
/**
 * convert to a Variant Context, given:
 * @param name the name of the ROD
 * @param input the Rod object, in this case a HapMapGenotypeROD
 * @param refAllele the reference base as an Allele object
 * @return a VariantContext object
 */
VariantContext convert(String name, Object input, Allele refAllele) {
HapMapGenotypeROD hapmap = (HapMapGenotypeROD)input;
// add the reference allele up front so it is present even if no sample carries it
HashSet<Allele> alleles = new HashSet<Allele>();
alleles.add(refAllele);
// make a mapping from sample to genotype
String[] samples = hapmap.getSampleIDs();
String[] genotypeStrings = hapmap.getGenotypes();
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(samples.length);
for ( int i = 0; i < samples.length; i++ ) {
// ignore bad genotypes (HapMap encodes no-calls with 'N'); such samples get no Genotype entry
if ( genotypeStrings[i].contains("N") )
continue;
// each genotype string is two concatenated single-base alleles, e.g. "AC" -- TODO confirm for indel rows
String a1 = genotypeStrings[i].substring(0,1);
String a2 = genotypeStrings[i].substring(1);
// an allele is flagged as reference iff its bases match refAllele
Allele allele1 = new Allele(a1, refAllele.basesMatch(a1));
Allele allele2 = new Allele(a2, refAllele.basesMatch(a2));
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2);
myAlleles.add(allele1);
myAlleles.add(allele2);
// accumulate the site-level allele set from the observed genotypes
alleles.add(allele1);
alleles.add(allele2);
Genotype g = new Genotype(samples[i], myAlleles);
genotypes.put(samples[i], g);
}
// no per-site quality (NO_NEG_LOG_10PERROR), no filters, empty attribute map
VariantContext vc = new VariantContext(name, hapmap.getLocation(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, new HashMap<String, String>());
vc.validate();
return vc;
}
}
}

View File

@ -1,126 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.HapMapGenotypeROD;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
import java.io.File;
/**
 * Converts HapMap genotype information to VCF format.
 *
 * A RodWalker that scans HapMapGenotypeROD tracks and emits one VCF record per
 * HapMap site, carrying the GT field for every sample in the HapMap header.
 * The VCF header (including the sample list) is written lazily on the first
 * record seen, since the sample IDs come from the rod itself.
 */
public class HapMap2VCF extends RodWalker<Integer, Integer> {
@Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true)
protected File VCF_OUT;
// Created in initialize(); header is emitted on the first mapped record.
// NOTE(review): this writer is never closed -- onTraversalDone() is empty; verify output is flushed.
private VCFWriter vcfWriter;
// Sample IDs from the HapMap rod header; filled on the first map() call and
// assumed identical for every subsequent record of the track.
String[] sample_ids = null;
public void initialize() {
vcfWriter = new VCFWriter(VCF_OUT);
}
/**
* For each HapMap record, generate and print the VCF record string
* Output the VCF header if this is the first record
*
* @param tracker the meta-data tracker for the current locus (may be null)
* @param ref the reference context at the current locus
* @param context the alignment context at the current locus
* @return 0 if there was no tracker data at this locus, 1 otherwise
*/
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return 0;
for ( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
if ( rod instanceof HapMapGenotypeROD ) {
HapMapGenotypeROD hapmap_rod = (HapMapGenotypeROD) rod;
// If this is the first time map is called, we need to fill out the sample_ids from the rod
if (sample_ids == null) {
// ensure that there are no duplicate sample IDs
sample_ids = hapmap_rod.getSampleIDs();
Set<String> sample_id_set = new LinkedHashSet<String>(Arrays.asList(sample_ids));
if (sample_id_set.size() != sample_ids.length)
throw new IllegalStateException("Sample set passed into HapMap2VCF has repeated sample IDs");
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "HapMap2VCF"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
// write out the header once
vcfWriter.writeHeader(new VCFHeader(hInfo, sample_id_set));
}
// Get reference base
Character ref_allele = ref.getBase();
VCFGenotypeEncoding refAllele = new VCFGenotypeEncoding(ref_allele.toString());
// Create a new record with a GT-only FORMAT field
VCFRecord record = new VCFRecord(Character.toString(ref_allele), context.getLocation(), "GT");
// Record each sample's genotype info; genotype order is assumed to
// parallel sample_ids order (both come from the rod header)
String[] hapmap_rod_genotypes = hapmap_rod.getGenotypes();
for (int i = 0; i < hapmap_rod_genotypes.length; i++) {
String sample_id = sample_ids[i];
String hapmap_rod_genotype = hapmap_rod_genotypes[i];
// for each sample, set the genotype if it exists ('N' marks a HapMap no-call)
if (!hapmap_rod_genotype.contains("N")) {
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
// the genotype string is two concatenated single-base alleles, e.g. "AC"
VCFGenotypeEncoding allele1 = new VCFGenotypeEncoding(hapmap_rod_genotype.substring(0,1));
VCFGenotypeEncoding allele2 = new VCFGenotypeEncoding(hapmap_rod_genotype.substring(1));
alleles.add(allele1);
alleles.add(allele2);
VCFGenotypeRecord genotype = new VCFGenotypeRecord(sample_id, alleles, VCFGenotypeRecord.PHASE.UNPHASED);
record.addGenotypeRecord(genotype);
// any non-reference allele observed in a genotype becomes an ALT allele
if ( !allele1.equals(refAllele) )
record.addAlternateBase(allele1);
if ( !allele2.equals(refAllele) )
record.addAlternateBase(allele2);
}
}
// Add dbsnp ID if the locus overlaps a real SNP in the bound dbsnp track
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
if (dbsnp != null)
record.setID(dbsnp.getRS_ID());
// Write the VCF record
vcfWriter.addRecord(record);
}
}
return 1;
}
/**
* Initialize the number of loci processed to zero.
*
* @return 0
*/
public Integer reduceInit() {
return 0;
}
/**
* Increment the number of rods processed.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return the new number of rods processed.
*/
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
// No-op: traversal completion requires no summary output.
// NOTE(review): vcfWriter is not closed here -- confirm VCFWriter flushes on write.
public void onTraversalDone(Integer value) {}
}

View File

@ -3,7 +3,9 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RMD;
@ -15,9 +17,11 @@ import java.util.*;
/**
* Converts variants from other file formats to VCF format.
*/
@Requires(value={},referenceMetaData=@RMD(name="variant",type= ReferenceOrderedDatum.class))
@Requires(value={},referenceMetaData=@RMD(name=VariantsToVCF.INPUT_ROD_NAME,type= ReferenceOrderedDatum.class))
public class VariantsToVCF extends RodWalker<Integer, Integer> {
public static final String INPUT_ROD_NAME = "variant";
@Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false)
protected String sampleName = null;
@ -34,37 +38,50 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
Collection<VariantContext> contexts = tracker.getVariantContexts("variant", ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), true, false);
Allele refAllele = new Allele(Character.toString(ref.getBase()), true);
Collection<VariantContext> contexts = tracker.getVariantContexts(INPUT_ROD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), refAllele, true, false);
for ( VariantContext vc : contexts ) {
VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), Arrays.asList(ALLOWED_FORMAT_FIELDS), false, false);
if ( dbsnp != null )
vcf.setID(dbsnp.getRS_ID());
if ( sampleName != null && vcf.hasGenotypeData() && vcf.getGenotype("variant") != null )
vcf.getGenotype("variant").setSampleName(sampleName);
writeRecord(vcf);
// set the appropriate sample name if necessary
if ( sampleName != null && vcf.hasGenotypeData() && vcf.getGenotype(INPUT_ROD_NAME) != null )
vcf.getGenotype(INPUT_ROD_NAME).setSampleName(sampleName);
writeRecord(vcf, tracker);
}
return 1;
}
private void writeRecord(VCFRecord rec) {
private void writeRecord(VCFRecord rec, RefMetaDataTracker tracker) {
if ( vcfwriter == null ) {
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "VariantFiltration"));
hInfo.add(new VCFHeaderLine("source", "VariantsToVCF"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
vcfwriter = new VCFWriter(out);
TreeSet<String> samples = new TreeSet<String>();
if ( sampleName != null )
if ( sampleName != null ) {
samples.add(sampleName);
else
samples.addAll(Arrays.asList(rec.getSampleNames()));
} else {
RODRecordList rods = tracker.getTrackData(INPUT_ROD_NAME, null);
if ( rods.size() == 0 )
throw new IllegalStateException("VCF record was created, but no rod data is present");
ReferenceOrderedDatum rod = rods.get(0);
if ( rod instanceof RodVCF )
samples.addAll(Arrays.asList(((RodVCF)rod).getSampleNames()));
else if ( rod instanceof HapMapGenotypeROD )
samples.addAll(Arrays.asList(((HapMapGenotypeROD)rod).getSampleIDs()));
else
samples.addAll(Arrays.asList(rec.getSampleNames()));
}
vcfwriter = new VCFWriter(out);
vcfwriter.writeHeader(new VCFHeader(hInfo, samples));
}
vcfwriter.addRecord(rec);
}
@ -78,6 +95,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
}
public void onTraversalDone(Integer sum) {
vcfwriter.close();
if ( vcfwriter != null )
vcfwriter.close();
}
}

View File

@ -1,37 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.List;
import java.util.ArrayList;
import java.io.File;
/**
 * @author aaron
 * <p/>
 * Class HapMap2VCFIntegrationTest
 * <p/>
 * Integration test for the HapMap2VCF walker.
 */
public class HapMap2VCFIntegrationTest extends WalkerTest {
/**
 * Runs HapMap2VCF over the first megabase of chrX of a YRI HapMap genotype
 * file and checks the MD5 of the produced VCF against a known-good value.
 */
@Test
public void testHapMap2VCF() {
// Expected MD5 of the generated VCF output.
List<String> md5 = new ArrayList<String>();
md5.add("a4de8775433c8228eeac2c91d4d3737a");
WalkerTestSpec spec = new WalkerTestSpec(
"-R " + seqLocation + "references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" +
" --rodBind HapMapChip,HapMapGenotype," + validationDataLocation + "hapmap_phase_ii+iii_genotypes_chrX_YRI_r27_nr.hapmap" +
" -T HapMap2VCF" +
" -L chrX:1-1,000,000" +
" --vcfOutput %s",
1, // just one output file
md5);
// Test label now matches the method name; it was mislabeled
// "testHapMap2VCFUsingGeliInput" (copy-paste from a Geli-based test),
// and the unused 'result' local has been removed.
executeTest("testHapMap2VCF", spec);
}
}

View File

@ -20,7 +20,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testVariantsToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();
md5.add("8f32f0efed5d0233cf9292198f4f01d8");
md5.add("211be63cf93cddf021e5b0eb9341b386");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
@ -37,7 +37,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testGenotypesToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();
md5.add("40cc4d04d9a50043ce1322ea2650c453");
md5.add("b31446fa91b8ed82ad73a5a5a72700a7");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
@ -51,4 +51,19 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
executeTest("testVariantsToVCFUsingGeliInput #2", spec).getFirst();
}
/**
 * Runs VariantsToVCF over the first megabase of chr1 of a YRI HapMap
 * genotype file and checks the MD5 of the produced VCF.
 */
@Test
public void testGenotypesToVCFUsingHapMapInput() {
// Expected MD5 of the VCF produced from the HapMap input.
List<String> md5 = new ArrayList<String>();
md5.add("03ff126faf5751a83bd7ab9e020bce7e");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
" -B variant,HapMapGenotype," + validationDataLocation + "rawHapMap.yri.chr1.txt" +
" -T VariantsToVCF" +
" -L 1:1-1,000,000" +
" -o %s",
1, // just one output file
md5);
// Label matches the method name (was "testVariantsToVCFUsingHapMapInput"),
// consistent with the other tests in this class.
executeTest("testGenotypesToVCFUsingHapMapInput", spec).getFirst();
}
}