diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index bbcce6677..395723f27 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -11,7 +11,6 @@ import org.broad.tribble.vcf.*; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; -import org.broadinstitute.sting.utils.*; import java.util.*; @@ -218,10 +217,21 @@ public class VariantContextAdaptors { HapMapFeature hapmap = (HapMapFeature)input; - // add the reference allele HashSet alleles = new HashSet(); - Allele refAllele = Allele.create(ref.getBase(), true); - alleles.add(refAllele); + Allele refSNPAllele = Allele.create(ref.getBase(), true); + int deletionLength = -1; + + Map alleleMap = hapmap.getActualAlleles(); + // use the actual alleles, if available + if ( alleleMap != null ) { + alleles.addAll(alleleMap.values()); + Allele deletionAllele = alleleMap.get(HapMapFeature.INSERTION); // yes, use insertion here (since we want the reference bases) + if ( deletionAllele != null && deletionAllele.isReference() ) + deletionLength = deletionAllele.length(); + } else { + // add the reference allele for SNPs + alleles.add(refSNPAllele); + } // make a mapping from sample to genotype String[] samples = hapmap.getSampleIDs(); @@ -230,26 +240,42 @@ public class VariantContextAdaptors { Map genotypes = new HashMap(samples.length); for ( int i = 0; i < samples.length; i++ ) { // ignore bad genotypes - if ( genotypeStrings[i].contains("N") || genotypeStrings[i].contains("I") || genotypeStrings[i].contains("D") ) + if ( genotypeStrings[i].contains("N") ) continue; String a1 = genotypeStrings[i].substring(0,1); String a2 = genotypeStrings[i].substring(1); - - Allele allele1 = Allele.create(a1, refAllele.basesMatch(a1)); - Allele allele2 = Allele.create(a2, refAllele.basesMatch(a2)); - ArrayList myAlleles = new ArrayList(2); - myAlleles.add(allele1); - myAlleles.add(allele2); - alleles.add(allele1); - alleles.add(allele2); + + // use the mapping to actual alleles, if available + if ( alleleMap != null ) { + myAlleles.add(alleleMap.get(a1)); + myAlleles.add(alleleMap.get(a2)); + } else { + // ignore indels (which we can't handle without knowing the alleles) + if ( genotypeStrings[i].contains("I") || genotypeStrings[i].contains("D") ) + continue; + + Allele allele1 = Allele.create(a1, refSNPAllele.basesMatch(a1)); + Allele allele2 = Allele.create(a2, refSNPAllele.basesMatch(a2)); + + myAlleles.add(allele1); + myAlleles.add(allele2); + alleles.add(allele1); + alleles.add(allele2); + } Genotype g = new Genotype(samples[i], myAlleles); genotypes.put(samples[i], g); } - VariantContext vc = new VariantContext(name, hapmap.getChr(), hapmap.getStart(), hapmap.getEnd(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, new HashMap()); + HashMap attrs = new HashMap(1); + attrs.put(VariantContext.ID_KEY, hapmap.getName()); + + long end = hapmap.getEnd(); + if ( deletionLength > 0 ) + end += deletionLength; + VariantContext vc = new VariantContext(name, hapmap.getChr(), hapmap.getStart(), end, alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attrs); return vc; } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/VariantsToVCF.java b/java/src/org/broadinstitute/sting/gatk/walkers/VariantsToVCF.java index d5b8445f7..ad18dbf92 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/VariantsToVCF.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/VariantsToVCF.java @@ -25,17 +25,25 @@ package org.broadinstitute.sting.gatk.walkers; +import net.sf.samtools.util.CloseableIterator; +import org.broad.tribble.dbsnp.DbSNPCodec; +import org.broad.tribble.dbsnp.DbSNPFeature; import org.broad.tribble.hapmap.HapMapFeature; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.vcf.VCFUtils; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.SampleUtils; @@ -46,11 +54,12 @@ import java.util.*; * Converts variants from other file formats to VCF format. */ @Requires(value={},referenceMetaData=@RMD(name=VariantsToVCF.INPUT_ROD_NAME, type=VariantContext.class)) -@Reference(window=@Window(start=0,stop=40)) +@Reference(window=@Window(start=-40,stop=40)) public class VariantsToVCF extends RodWalker { @Output(doc="File to which variants should be written",required=true) - protected VCFWriter vcfwriter = null; + protected VCFWriter baseWriter = null; + private SortingVCFWriter vcfwriter; // needed because hapmap indel records move public static final String INPUT_ROD_NAME = "variant"; @@ -64,19 +73,27 @@ public class VariantsToVCF extends RodWalker { private EnumSet ALLOWED_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP, VariantContext.Type.NO_VARIATION, VariantContext.Type.INDEL, VariantContext.Type.MNP); + // for dealing with indels in hapmap + CloseableIterator dbsnpIterator = null; + + public void initialize() { + vcfwriter = new SortingVCFWriter(baseWriter, 40, false); + } + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) ) return 0; String rsID = DbSNPHelper.rsIDOfFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); - Collection contexts = tracker.getVariantContexts(ref, INPUT_ROD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), true, false); + Collection contexts = getVariantContexts(tracker, ref); for ( VariantContext vc : contexts ) { Map attrs = new HashMap(vc.getAttributes()); - if ( rsID != null ) + if ( rsID != null && !vc.hasID() ) { attrs.put(VariantContext.ID_KEY, rsID); - vc = VariantContext.modifyAttributes(vc, attrs); + vc = VariantContext.modifyAttributes(vc, attrs); + } // set the appropriate sample name if necessary if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(INPUT_ROD_NAME) ) { @@ -92,6 +109,88 @@ public class VariantsToVCF extends RodWalker { return 1; } + private Collection getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref) { + // we need to special case the HapMap format because indels aren't handled correctly + List features = tracker.getReferenceMetaData(INPUT_ROD_NAME, true); + if ( features.size() > 0 && features.get(0) instanceof HapMapFeature ) { + ArrayList hapmapVCs = new ArrayList(features.size()); + for ( Object feature : features ) { + HapMapFeature hapmap = (HapMapFeature)feature; + Byte refBase = null; + + // if it's an indel, we need to figure out the alleles + if ( hapmap.getAlleles()[0].equals("-") ) { + Map alleleMap = new HashMap(2); + + // get the dbsnp object corresponding to this record, so we can learn whether this is an insertion or deletion + DbSNPFeature dbsnp = getDbsnpFeature(hapmap.getName()); + if ( dbsnp == null || dbsnp.getVariantType().equalsIgnoreCase("mixed") ) + continue; + + boolean isInsertion = dbsnp.getVariantType().equalsIgnoreCase("insertion"); + + alleleMap.put(HapMapFeature.DELETION, Allele.create(Allele.NULL_ALLELE_STRING, isInsertion)); + alleleMap.put(HapMapFeature.INSERTION, Allele.create(hapmap.getAlleles()[1], !isInsertion)); + hapmap.setActualAlleles(alleleMap); + + // also, use the correct positioning for insertions + if ( isInsertion ) + hapmap.updatePosition(dbsnp.getStart()); + else + hapmap.updatePosition(dbsnp.getStart() - 1); + + if ( hapmap.getStart() < ref.getWindow().getStart() ) { + logger.warn("Hapmap record at " + ref.getLocus() + " represents an indel too large to be converted; skipping..."); + continue; + } + refBase = ref.getBases()[hapmap.getStart() - ref.getWindow().getStart()]; + } + VariantContext vc = VariantContextAdaptors.toVariantContext(INPUT_ROD_NAME, hapmap, ref); + if ( vc != null ) { + if ( refBase != null ) { + Map attrs = new HashMap(vc.getAttributes()); + attrs.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, refBase); + vc = VariantContext.modifyAttributes(vc, attrs); + } + hapmapVCs.add(vc); + } + } + return hapmapVCs; + } + + // for everything else, we can just convert to VariantContext + return tracker.getVariantContexts(ref, INPUT_ROD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, ref.getLocus(), true, false); + } + + private DbSNPFeature getDbsnpFeature(String rsID) { + if ( dbsnpIterator == null ) { + ReferenceOrderedDataSource dbsnpDataSource = null; + for ( ReferenceOrderedDataSource ds : getToolkit().getRodDataSources() ) { + if ( ds.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { + dbsnpDataSource = ds; + break; + } + } + + if ( dbsnpDataSource == null ) + throw new UserException.BadInput("No dbSNP rod was provided, but one is needed to decipher the correct indel alleles from the HapMap records"); + + RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe); + dbsnpIterator = builder.createInstanceOfTrack(DbSNPCodec.class, DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, dbsnpDataSource.getReferenceOrderedData().getFile()).getIterator(); + // Note that we should really use some sort of seekable iterator here so that the search doesn't take forever + // (but it's complicated because the hapmap location doesn't match the dbsnp location, so we don't know where to seek to) + } + + while ( dbsnpIterator.hasNext() ) { + GATKFeature feature = dbsnpIterator.next(); + DbSNPFeature dbsnp = (DbSNPFeature)feature.getUnderlyingObject(); + if ( dbsnp.getRsID().equals(rsID) ) + return dbsnp; + } + + return null; + } + private void writeRecord(VariantContext vc, RefMetaDataTracker tracker, byte ref) { if ( !wroteHeader ) { wroteHeader = true; @@ -133,7 +232,7 @@ public class VariantsToVCF extends RodWalker { } vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); - vcfwriter.add(vc,ref); + vcfwriter.add(vc, ref); } public Integer reduceInit() { @@ -144,5 +243,9 @@ public class VariantsToVCF extends RodWalker { return value + sum; } - public void onTraversalDone(Integer sum) {} + public void onTraversalDone(Integer sum) { + if ( dbsnpIterator != null ) + dbsnpIterator.close(); + vcfwriter.close(); + } } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/VariantsToVCFIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/VariantsToVCFIntegrationTest.java index b9aadc2b8..52b4ee916 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/VariantsToVCFIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/VariantsToVCFIntegrationTest.java @@ -56,7 +56,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { @Test public void testGenotypesToVCFUsingHapMapInput() { List md5 = new ArrayList(); - md5.add("946d53387d8048915f880101169620b4"); + md5.add("6f34528569f8cf5941cb365fa77288c1"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference +