HapMap-to-VCF now works fine within Variants-to-VCF. Added integration test for it and removed old code.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3077 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
78af6d5a40
commit
14bf6923a8
|
|
@ -7,39 +7,29 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
|
||||||
public class HapMapGenotypeROD extends TabularROD
|
public class HapMapGenotypeROD extends TabularROD
|
||||||
{
|
{
|
||||||
public HapMapGenotypeROD(final String name)
|
public HapMapGenotypeROD(final String name) {
|
||||||
{
|
|
||||||
super(name);
|
super(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
public GenomeLoc getLocation()
|
public GenomeLoc getLocation() {
|
||||||
{
|
|
||||||
//System.out.printf("chrom: %s; pos: %s\n", this.get("chrom"), this.get("pos"));
|
|
||||||
|
|
||||||
// For converting from Hg18 to b36 format:
|
// For converting from Hg18 to b36 format:
|
||||||
// return GenomeLocParser.createGenomeLoc(this.get("chrom").replaceAll("chr", ""), Long.parseLong(this.get("pos")));
|
// return GenomeLocParser.createGenomeLoc(this.get("chrom").replaceAll("chr", ""), Long.parseLong(this.get("pos")));
|
||||||
return GenomeLocParser.createGenomeLoc(this.get("chrom"), Long.parseLong(this.get("pos")));
|
return GenomeLocParser.createGenomeLoc(this.get("chrom"), Long.parseLong(this.get("pos")));
|
||||||
}
|
}
|
||||||
|
|
||||||
public String[] getSampleIDs()
|
public String[] getSampleIDs() {
|
||||||
{
|
ArrayList<String> header = getHeader();
|
||||||
ArrayList<String> header = this.getHeader();
|
|
||||||
String[] sample_ids = new String[header.size()-11];
|
String[] sample_ids = new String[header.size()-11];
|
||||||
for (int i = 11; i < header.size(); i++)
|
for (int i = 11; i < header.size(); i++)
|
||||||
{
|
|
||||||
sample_ids[i-11] = header.get(i);
|
sample_ids[i-11] = header.get(i);
|
||||||
}
|
|
||||||
return sample_ids;
|
return sample_ids;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String[] getGenotypes()
|
public String[] getGenotypes() {
|
||||||
{
|
ArrayList<String> header = getHeader();
|
||||||
ArrayList<String> header = this.getHeader();
|
|
||||||
String[] genotypes = new String[header.size()-11];
|
String[] genotypes = new String[header.size()-11];
|
||||||
for (int i = 11; i < header.size(); i++)
|
for (int i = 11; i < header.size(); i++)
|
||||||
{
|
genotypes[i-11] = get(header.get(i));
|
||||||
genotypes[i-11] = this.get(header.get(i));
|
|
||||||
}
|
|
||||||
return genotypes;
|
return genotypes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.refdata;
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
@ -219,17 +220,17 @@ public class RefMetaDataTracker {
|
||||||
*
|
*
|
||||||
* The name of each VariantContext corresponds to the ROD name.
|
* The name of each VariantContext corresponds to the ROD name.
|
||||||
*
|
*
|
||||||
* @param curLocation
|
* @param curLocation location
|
||||||
* @param allowedTypes
|
* @param allowedTypes allowed types
|
||||||
* @param requireStartHere
|
* @param requireStartHere do we require the rod to start at this location?
|
||||||
* @param takeFirstOnly
|
* @param takeFirstOnly do we take the first rod only?
|
||||||
* @return
|
* @return variant context
|
||||||
*/
|
*/
|
||||||
public Collection<VariantContext> getAllVariantContexts(EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
|
public Collection<VariantContext> getAllVariantContexts(EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
|
||||||
List<VariantContext> contexts = new ArrayList<VariantContext>();
|
List<VariantContext> contexts = new ArrayList<VariantContext>();
|
||||||
|
|
||||||
for ( RODRecordList rodList : getBoundRodTracks() ) {
|
for ( RODRecordList rodList : getBoundRodTracks() ) {
|
||||||
addVariantContexts(contexts, rodList, allowedTypes, curLocation, requireStartHere, takeFirstOnly);
|
addVariantContexts(contexts, rodList, allowedTypes, curLocation, null, requireStartHere, takeFirstOnly);
|
||||||
}
|
}
|
||||||
|
|
||||||
return contexts;
|
return contexts;
|
||||||
|
|
@ -240,25 +241,33 @@ public class RefMetaDataTracker {
|
||||||
*
|
*
|
||||||
* see getVariantContexts for more information.
|
* see getVariantContexts for more information.
|
||||||
*
|
*
|
||||||
* @param name
|
* @param name name
|
||||||
* @param curLocation
|
* @param curLocation location
|
||||||
* @param allowedTypes
|
* @param allowedTypes allowed types
|
||||||
* @param requireStartHere
|
* @param requireStartHere do we require the rod to start at this location?
|
||||||
* @param takeFirstOnly
|
* @param takeFirstOnly do we take the first rod only?
|
||||||
* @return
|
* @return variant context
|
||||||
*/
|
*/
|
||||||
public Collection<VariantContext> getVariantContexts(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
|
public Collection<VariantContext> getVariantContexts(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
|
||||||
return getVariantContexts(Arrays.asList(name), allowedTypes, curLocation, requireStartHere, takeFirstOnly);
|
return getVariantContexts(Arrays.asList(name), allowedTypes, curLocation, null, requireStartHere, takeFirstOnly);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Collection<VariantContext> getVariantContexts(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, Allele ref, boolean requireStartHere, boolean takeFirstOnly ) {
|
||||||
|
return getVariantContexts(Arrays.asList(name), allowedTypes, curLocation, ref, requireStartHere, takeFirstOnly);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Collection<VariantContext> getVariantContexts(Collection<String> names, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
|
public Collection<VariantContext> getVariantContexts(Collection<String> names, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
|
||||||
|
return getVariantContexts(names, allowedTypes, curLocation, null, requireStartHere, takeFirstOnly);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Collection<VariantContext> getVariantContexts(Collection<String> names, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, Allele ref, boolean requireStartHere, boolean takeFirstOnly ) {
|
||||||
Collection<VariantContext> contexts = new ArrayList<VariantContext>();
|
Collection<VariantContext> contexts = new ArrayList<VariantContext>();
|
||||||
|
|
||||||
for ( String name : names ) {
|
for ( String name : names ) {
|
||||||
RODRecordList rodList = getTrackData(name, null);
|
RODRecordList rodList = getTrackData(name, null);
|
||||||
|
|
||||||
if ( rodList != null )
|
if ( rodList != null )
|
||||||
addVariantContexts(contexts, rodList, allowedTypes, curLocation, requireStartHere, takeFirstOnly );
|
addVariantContexts(contexts, rodList, allowedTypes, curLocation, ref, requireStartHere, takeFirstOnly );
|
||||||
}
|
}
|
||||||
|
|
||||||
return contexts;
|
return contexts;
|
||||||
|
|
@ -268,11 +277,11 @@ public class RefMetaDataTracker {
|
||||||
* Gets the variant context associated with name, and assumes the system only has a single bound track at this location. Throws an exception if not.
|
* Gets the variant context associated with name, and assumes the system only has a single bound track at this location. Throws an exception if not.
|
||||||
* see getVariantContexts for more information.
|
* see getVariantContexts for more information.
|
||||||
*
|
*
|
||||||
* @param name
|
* @param name name
|
||||||
* @param curLocation
|
* @param curLocation location
|
||||||
* @param allowedTypes
|
* @param allowedTypes allowed types
|
||||||
* @param requireStartHere
|
* @param requireStartHere do we require the rod to start at this location?
|
||||||
* @return
|
* @return variant context
|
||||||
*/
|
*/
|
||||||
public VariantContext getVariantContext(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere ) {
|
public VariantContext getVariantContext(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere ) {
|
||||||
Collection<VariantContext> contexts = getVariantContexts(name, allowedTypes, curLocation, requireStartHere, false );
|
Collection<VariantContext> contexts = getVariantContexts(name, allowedTypes, curLocation, requireStartHere, false );
|
||||||
|
|
@ -285,11 +294,15 @@ public class RefMetaDataTracker {
|
||||||
return contexts.iterator().next();
|
return contexts.iterator().next();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addVariantContexts(Collection<VariantContext> contexts, RODRecordList rodList, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
|
private void addVariantContexts(Collection<VariantContext> contexts, RODRecordList rodList, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, Allele ref, boolean requireStartHere, boolean takeFirstOnly ) {
|
||||||
for ( ReferenceOrderedDatum rec : rodList ) {
|
for ( ReferenceOrderedDatum rec : rodList ) {
|
||||||
if ( VariantContextAdaptors.canBeConvertedToVariantContext(rec) ) {
|
if ( VariantContextAdaptors.canBeConvertedToVariantContext(rec) ) {
|
||||||
// ok, we might actually be able to turn this record in a variant context
|
// ok, we might actually be able to turn this record in a variant context
|
||||||
VariantContext vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec);
|
VariantContext vc;
|
||||||
|
if ( ref == null )
|
||||||
|
vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec);
|
||||||
|
else
|
||||||
|
vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec, ref);
|
||||||
|
|
||||||
if ( vc == null ) // sometimes the track has odd stuff in it that can't be converted
|
if ( vc == null ) // sometimes the track has odd stuff in it that can't be converted
|
||||||
continue;
|
continue;
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,7 @@ public class VariantContextAdaptors {
|
||||||
adaptors.put(RodVCF.class, new RodVCFAdaptor());
|
adaptors.put(RodVCF.class, new RodVCFAdaptor());
|
||||||
adaptors.put(VCFRecord.class, new VCFRecordAdaptor());
|
adaptors.put(VCFRecord.class, new VCFRecordAdaptor());
|
||||||
adaptors.put(PlinkRod.class, new PlinkRodAdaptor());
|
adaptors.put(PlinkRod.class, new PlinkRodAdaptor());
|
||||||
|
adaptors.put(HapMapGenotypeROD.class, new HapMapAdaptor());
|
||||||
adaptors.put(RodGLF.class, new GLFAdaptor());
|
adaptors.put(RodGLF.class, new GLFAdaptor());
|
||||||
adaptors.put(RodGeliText.class, new GeliAdaptor());
|
adaptors.put(RodGeliText.class, new GeliAdaptor());
|
||||||
}
|
}
|
||||||
|
|
@ -568,4 +569,67 @@ public class VariantContextAdaptors {
|
||||||
return null; // can't handle anything else
|
return null; // can't handle anything else
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// HapMap to VariantContext
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
private static class HapMapAdaptor extends VCAdaptor {
|
||||||
|
/**
|
||||||
|
* convert to a Variant Context, given:
|
||||||
|
* @param name the name of the ROD
|
||||||
|
* @param input the Rod object, in this case a RodGeliText
|
||||||
|
* @return a VariantContext object
|
||||||
|
*/
|
||||||
|
VariantContext convert(String name, Object input) {
|
||||||
|
throw new UnsupportedOperationException("Conversion from HapMap to VariantContext requires knowledge of the reference allele");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* convert to a Variant Context, given:
|
||||||
|
* @param name the name of the ROD
|
||||||
|
* @param input the Rod object, in this case a RodGeliText
|
||||||
|
* @param refAllele the reference base as an Allele object
|
||||||
|
* @return a VariantContext object
|
||||||
|
*/
|
||||||
|
VariantContext convert(String name, Object input, Allele refAllele) {
|
||||||
|
HapMapGenotypeROD hapmap = (HapMapGenotypeROD)input;
|
||||||
|
|
||||||
|
// add the reference allele
|
||||||
|
HashSet<Allele> alleles = new HashSet<Allele>();
|
||||||
|
alleles.add(refAllele);
|
||||||
|
|
||||||
|
// make a mapping from sample to genotype
|
||||||
|
String[] samples = hapmap.getSampleIDs();
|
||||||
|
String[] genotypeStrings = hapmap.getGenotypes();
|
||||||
|
|
||||||
|
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(samples.length);
|
||||||
|
for ( int i = 0; i < samples.length; i++ ) {
|
||||||
|
// ignore bad genotypes
|
||||||
|
if ( genotypeStrings[i].contains("N") )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
String a1 = genotypeStrings[i].substring(0,1);
|
||||||
|
String a2 = genotypeStrings[i].substring(1);
|
||||||
|
|
||||||
|
Allele allele1 = new Allele(a1, refAllele.basesMatch(a1));
|
||||||
|
Allele allele2 = new Allele(a2, refAllele.basesMatch(a2));
|
||||||
|
|
||||||
|
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2);
|
||||||
|
myAlleles.add(allele1);
|
||||||
|
myAlleles.add(allele2);
|
||||||
|
alleles.add(allele1);
|
||||||
|
alleles.add(allele2);
|
||||||
|
|
||||||
|
Genotype g = new Genotype(samples[i], myAlleles);
|
||||||
|
genotypes.put(samples[i], g);
|
||||||
|
}
|
||||||
|
|
||||||
|
VariantContext vc = new VariantContext(name, hapmap.getLocation(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, new HashMap<String, String>());
|
||||||
|
vc.validate();
|
||||||
|
return vc;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,126 +0,0 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.HapMapGenotypeROD;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
|
||||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
import java.io.File;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Converts HapMap genotype information to VCF format
|
|
||||||
*/
|
|
||||||
public class HapMap2VCF extends RodWalker<Integer, Integer> {
|
|
||||||
|
|
||||||
@Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true)
|
|
||||||
protected File VCF_OUT;
|
|
||||||
|
|
||||||
private VCFWriter vcfWriter;
|
|
||||||
String[] sample_ids = null;
|
|
||||||
|
|
||||||
public void initialize() {
|
|
||||||
vcfWriter = new VCFWriter(VCF_OUT);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* For each HapMap record, generate and print the VCF record string
|
|
||||||
* Output the VCF header if this is the first record
|
|
||||||
*/
|
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
|
||||||
if ( tracker == null )
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
for ( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
|
|
||||||
if ( rod instanceof HapMapGenotypeROD ) {
|
|
||||||
HapMapGenotypeROD hapmap_rod = (HapMapGenotypeROD) rod;
|
|
||||||
|
|
||||||
// If this is the first time map is called, we need to fill out the sample_ids from the rod
|
|
||||||
if (sample_ids == null) {
|
|
||||||
// ensure that there are no duplicate sample IDs
|
|
||||||
sample_ids = hapmap_rod.getSampleIDs();
|
|
||||||
Set<String> sample_id_set = new LinkedHashSet<String>(Arrays.asList(sample_ids));
|
|
||||||
if (sample_id_set.size() != sample_ids.length)
|
|
||||||
throw new IllegalStateException("Sample set passed into HapMap2VCF has repeated sample IDs");
|
|
||||||
|
|
||||||
// setup the header fields
|
|
||||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
|
||||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
|
||||||
hInfo.add(new VCFHeaderLine("source", "HapMap2VCF"));
|
|
||||||
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
|
||||||
|
|
||||||
// write out the header once
|
|
||||||
vcfWriter.writeHeader(new VCFHeader(hInfo, sample_id_set));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get reference base
|
|
||||||
Character ref_allele = ref.getBase();
|
|
||||||
VCFGenotypeEncoding refAllele = new VCFGenotypeEncoding(ref_allele.toString());
|
|
||||||
|
|
||||||
// Create a new record
|
|
||||||
VCFRecord record = new VCFRecord(Character.toString(ref_allele), context.getLocation(), "GT");
|
|
||||||
|
|
||||||
// Record each sample's genotype info
|
|
||||||
String[] hapmap_rod_genotypes = hapmap_rod.getGenotypes();
|
|
||||||
for (int i = 0; i < hapmap_rod_genotypes.length; i++) {
|
|
||||||
String sample_id = sample_ids[i];
|
|
||||||
String hapmap_rod_genotype = hapmap_rod_genotypes[i];
|
|
||||||
|
|
||||||
// for each sample, set the genotype if it exists
|
|
||||||
if (!hapmap_rod_genotype.contains("N")) {
|
|
||||||
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
|
|
||||||
VCFGenotypeEncoding allele1 = new VCFGenotypeEncoding(hapmap_rod_genotype.substring(0,1));
|
|
||||||
VCFGenotypeEncoding allele2 = new VCFGenotypeEncoding(hapmap_rod_genotype.substring(1));
|
|
||||||
alleles.add(allele1);
|
|
||||||
alleles.add(allele2);
|
|
||||||
|
|
||||||
VCFGenotypeRecord genotype = new VCFGenotypeRecord(sample_id, alleles, VCFGenotypeRecord.PHASE.UNPHASED);
|
|
||||||
record.addGenotypeRecord(genotype);
|
|
||||||
if ( !allele1.equals(refAllele) )
|
|
||||||
record.addAlternateBase(allele1);
|
|
||||||
if ( !allele2.equals(refAllele) )
|
|
||||||
record.addAlternateBase(allele2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add dbsnp ID
|
|
||||||
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
|
|
||||||
if (dbsnp != null)
|
|
||||||
record.setID(dbsnp.getRS_ID());
|
|
||||||
|
|
||||||
// Write the VCF record
|
|
||||||
vcfWriter.addRecord(record);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Initialize the number of loci processed to zero.
|
|
||||||
*
|
|
||||||
* @return 0
|
|
||||||
*/
|
|
||||||
public Integer reduceInit() {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Increment the number of rods processed.
|
|
||||||
*
|
|
||||||
* @param value result of the map.
|
|
||||||
* @param sum accumulator for the reduce.
|
|
||||||
* @return the new number of rods processed.
|
|
||||||
*/
|
|
||||||
public Integer reduce(Integer value, Integer sum) {
|
|
||||||
return sum + value;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void onTraversalDone(Integer value) {}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -3,7 +3,9 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||||
import org.broadinstitute.sting.gatk.refdata.*;
|
import org.broadinstitute.sting.gatk.refdata.*;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||||
|
|
@ -15,9 +17,11 @@ import java.util.*;
|
||||||
/**
|
/**
|
||||||
* Converts variants from other file formats to VCF format.
|
* Converts variants from other file formats to VCF format.
|
||||||
*/
|
*/
|
||||||
@Requires(value={},referenceMetaData=@RMD(name="variant",type= ReferenceOrderedDatum.class))
|
@Requires(value={},referenceMetaData=@RMD(name=VariantsToVCF.INPUT_ROD_NAME,type= ReferenceOrderedDatum.class))
|
||||||
public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
|
public static final String INPUT_ROD_NAME = "variant";
|
||||||
|
|
||||||
@Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false)
|
@Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false)
|
||||||
protected String sampleName = null;
|
protected String sampleName = null;
|
||||||
|
|
||||||
|
|
@ -34,37 +38,50 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
|
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
|
||||||
|
|
||||||
Collection<VariantContext> contexts = tracker.getVariantContexts("variant", ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), true, false);
|
Allele refAllele = new Allele(Character.toString(ref.getBase()), true);
|
||||||
|
Collection<VariantContext> contexts = tracker.getVariantContexts(INPUT_ROD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), refAllele, true, false);
|
||||||
|
|
||||||
for ( VariantContext vc : contexts ) {
|
for ( VariantContext vc : contexts ) {
|
||||||
VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), Arrays.asList(ALLOWED_FORMAT_FIELDS), false, false);
|
VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), Arrays.asList(ALLOWED_FORMAT_FIELDS), false, false);
|
||||||
if ( dbsnp != null )
|
if ( dbsnp != null )
|
||||||
vcf.setID(dbsnp.getRS_ID());
|
vcf.setID(dbsnp.getRS_ID());
|
||||||
if ( sampleName != null && vcf.hasGenotypeData() && vcf.getGenotype("variant") != null )
|
// set the appropriate sample name if necessary
|
||||||
vcf.getGenotype("variant").setSampleName(sampleName);
|
if ( sampleName != null && vcf.hasGenotypeData() && vcf.getGenotype(INPUT_ROD_NAME) != null )
|
||||||
writeRecord(vcf);
|
vcf.getGenotype(INPUT_ROD_NAME).setSampleName(sampleName);
|
||||||
|
writeRecord(vcf, tracker);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void writeRecord(VCFRecord rec) {
|
private void writeRecord(VCFRecord rec, RefMetaDataTracker tracker) {
|
||||||
if ( vcfwriter == null ) {
|
if ( vcfwriter == null ) {
|
||||||
// setup the header fields
|
// setup the header fields
|
||||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||||
hInfo.add(new VCFHeaderLine("source", "VariantFiltration"));
|
hInfo.add(new VCFHeaderLine("source", "VariantsToVCF"));
|
||||||
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
||||||
|
|
||||||
vcfwriter = new VCFWriter(out);
|
|
||||||
|
|
||||||
TreeSet<String> samples = new TreeSet<String>();
|
TreeSet<String> samples = new TreeSet<String>();
|
||||||
if ( sampleName != null )
|
if ( sampleName != null ) {
|
||||||
samples.add(sampleName);
|
samples.add(sampleName);
|
||||||
else
|
} else {
|
||||||
samples.addAll(Arrays.asList(rec.getSampleNames()));
|
|
||||||
|
|
||||||
|
RODRecordList rods = tracker.getTrackData(INPUT_ROD_NAME, null);
|
||||||
|
if ( rods.size() == 0 )
|
||||||
|
throw new IllegalStateException("VCF record was created, but no rod data is present");
|
||||||
|
|
||||||
|
ReferenceOrderedDatum rod = rods.get(0);
|
||||||
|
if ( rod instanceof RodVCF )
|
||||||
|
samples.addAll(Arrays.asList(((RodVCF)rod).getSampleNames()));
|
||||||
|
else if ( rod instanceof HapMapGenotypeROD )
|
||||||
|
samples.addAll(Arrays.asList(((HapMapGenotypeROD)rod).getSampleIDs()));
|
||||||
|
else
|
||||||
|
samples.addAll(Arrays.asList(rec.getSampleNames()));
|
||||||
|
}
|
||||||
|
|
||||||
|
vcfwriter = new VCFWriter(out);
|
||||||
vcfwriter.writeHeader(new VCFHeader(hInfo, samples));
|
vcfwriter.writeHeader(new VCFHeader(hInfo, samples));
|
||||||
|
|
||||||
}
|
}
|
||||||
vcfwriter.addRecord(rec);
|
vcfwriter.addRecord(rec);
|
||||||
}
|
}
|
||||||
|
|
@ -78,6 +95,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void onTraversalDone(Integer sum) {
|
public void onTraversalDone(Integer sum) {
|
||||||
vcfwriter.close();
|
if ( vcfwriter != null )
|
||||||
|
vcfwriter.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,37 +0,0 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.WalkerTest;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.io.File;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author aaron
|
|
||||||
* <p/>
|
|
||||||
* Class VariantsToVCFIntegrationTest
|
|
||||||
* <p/>
|
|
||||||
* test(s) for the VariantsToVCF walker.
|
|
||||||
*/
|
|
||||||
public class HapMap2VCFIntegrationTest extends WalkerTest {
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testHapMap2VCF() {
|
|
||||||
List<String> md5 = new ArrayList<String>();
|
|
||||||
md5.add("a4de8775433c8228eeac2c91d4d3737a");
|
|
||||||
|
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
|
||||||
"-R " + seqLocation + "references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" +
|
|
||||||
" --rodBind HapMapChip,HapMapGenotype," + validationDataLocation + "hapmap_phase_ii+iii_genotypes_chrX_YRI_r27_nr.hapmap" +
|
|
||||||
" -T HapMap2VCF" +
|
|
||||||
" -L chrX:1-1,000,000" +
|
|
||||||
" --vcfOutput %s",
|
|
||||||
1, // just one output file
|
|
||||||
md5);
|
|
||||||
List<File> result = executeTest("testHapMap2VCFUsingGeliInput", spec).getFirst();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -20,7 +20,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testVariantsToVCFUsingGeliInput() {
|
public void testVariantsToVCFUsingGeliInput() {
|
||||||
List<String> md5 = new ArrayList<String>();
|
List<String> md5 = new ArrayList<String>();
|
||||||
md5.add("8f32f0efed5d0233cf9292198f4f01d8");
|
md5.add("211be63cf93cddf021e5b0eb9341b386");
|
||||||
|
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||||
|
|
@ -37,7 +37,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testGenotypesToVCFUsingGeliInput() {
|
public void testGenotypesToVCFUsingGeliInput() {
|
||||||
List<String> md5 = new ArrayList<String>();
|
List<String> md5 = new ArrayList<String>();
|
||||||
md5.add("40cc4d04d9a50043ce1322ea2650c453");
|
md5.add("b31446fa91b8ed82ad73a5a5a72700a7");
|
||||||
|
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||||
|
|
@ -51,4 +51,19 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
|
||||||
executeTest("testVariantsToVCFUsingGeliInput #2", spec).getFirst();
|
executeTest("testVariantsToVCFUsingGeliInput #2", spec).getFirst();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGenotypesToVCFUsingHapMapInput() {
|
||||||
|
List<String> md5 = new ArrayList<String>();
|
||||||
|
md5.add("03ff126faf5751a83bd7ab9e020bce7e");
|
||||||
|
|
||||||
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
|
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||||
|
" -B variant,HapMapGenotype," + validationDataLocation + "rawHapMap.yri.chr1.txt" +
|
||||||
|
" -T VariantsToVCF" +
|
||||||
|
" -L 1:1-1,000,000" +
|
||||||
|
" -o %s",
|
||||||
|
1, // just one output file
|
||||||
|
md5);
|
||||||
|
executeTest("testVariantsToVCFUsingHapMapInput", spec).getFirst();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue