Support for indels in hapmap. This was non-trivial because not only does hapmap not tell you whether the allele is an insertion or deletion, but it also has a completely different positioning strategy (rightmost base). I'll send out an email tomorrow when the new HapMap3.3 VCF is ready.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4908 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-12-27 07:37:46 +00:00
parent 6d809321d3
commit 8a0c07b865
3 changed files with 151 additions and 22 deletions

View File

@ -11,7 +11,6 @@ import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.utils.*;
import java.util.*;
@ -218,10 +217,21 @@ public class VariantContextAdaptors {
HapMapFeature hapmap = (HapMapFeature)input;
// add the reference allele
HashSet<Allele> alleles = new HashSet<Allele>();
Allele refAllele = Allele.create(ref.getBase(), true);
alleles.add(refAllele);
Allele refSNPAllele = Allele.create(ref.getBase(), true);
int deletionLength = -1;
Map<String, Allele> alleleMap = hapmap.getActualAlleles();
// use the actual alleles, if available
if ( alleleMap != null ) {
alleles.addAll(alleleMap.values());
Allele deletionAllele = alleleMap.get(HapMapFeature.INSERTION); // yes, use insertion here (since we want the reference bases)
if ( deletionAllele != null && deletionAllele.isReference() )
deletionLength = deletionAllele.length();
} else {
// add the reference allele for SNPs
alleles.add(refSNPAllele);
}
// make a mapping from sample to genotype
String[] samples = hapmap.getSampleIDs();
@ -230,26 +240,42 @@ public class VariantContextAdaptors {
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(samples.length);
for ( int i = 0; i < samples.length; i++ ) {
// ignore bad genotypes
if ( genotypeStrings[i].contains("N") || genotypeStrings[i].contains("I") || genotypeStrings[i].contains("D") )
if ( genotypeStrings[i].contains("N") )
continue;
String a1 = genotypeStrings[i].substring(0,1);
String a2 = genotypeStrings[i].substring(1);
Allele allele1 = Allele.create(a1, refAllele.basesMatch(a1));
Allele allele2 = Allele.create(a2, refAllele.basesMatch(a2));
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2);
myAlleles.add(allele1);
myAlleles.add(allele2);
alleles.add(allele1);
alleles.add(allele2);
// use the mapping to actual alleles, if available
if ( alleleMap != null ) {
myAlleles.add(alleleMap.get(a1));
myAlleles.add(alleleMap.get(a2));
} else {
// ignore indels (which we can't handle without knowing the alleles)
if ( genotypeStrings[i].contains("I") || genotypeStrings[i].contains("D") )
continue;
Allele allele1 = Allele.create(a1, refSNPAllele.basesMatch(a1));
Allele allele2 = Allele.create(a2, refSNPAllele.basesMatch(a2));
myAlleles.add(allele1);
myAlleles.add(allele2);
alleles.add(allele1);
alleles.add(allele2);
}
Genotype g = new Genotype(samples[i], myAlleles);
genotypes.put(samples[i], g);
}
VariantContext vc = new VariantContext(name, hapmap.getChr(), hapmap.getStart(), hapmap.getEnd(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, new HashMap<String, String>());
HashMap<String, Object> attrs = new HashMap<String, Object>(1);
attrs.put(VariantContext.ID_KEY, hapmap.getName());
long end = hapmap.getEnd();
if ( deletionLength > 0 )
end += deletionLength;
VariantContext vc = new VariantContext(name, hapmap.getChr(), hapmap.getStart(), end, alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attrs);
return vc;
}
}

View File

@ -25,17 +25,25 @@
package org.broadinstitute.sting.gatk.walkers;
import net.sf.samtools.util.CloseableIterator;
import org.broad.tribble.dbsnp.DbSNPCodec;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.hapmap.HapMapFeature;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.SampleUtils;
@ -46,11 +54,12 @@ import java.util.*;
* Converts variants from other file formats to VCF format.
*/
@Requires(value={},referenceMetaData=@RMD(name=VariantsToVCF.INPUT_ROD_NAME, type=VariantContext.class))
@Reference(window=@Window(start=0,stop=40))
@Reference(window=@Window(start=-40,stop=40))
public class VariantsToVCF extends RodWalker<Integer, Integer> {
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfwriter = null;
protected VCFWriter baseWriter = null;
private SortingVCFWriter vcfwriter; // needed because hapmap indel records move
public static final String INPUT_ROD_NAME = "variant";
@ -64,19 +73,27 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
private EnumSet<VariantContext.Type> ALLOWED_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP,
VariantContext.Type.NO_VARIATION, VariantContext.Type.INDEL, VariantContext.Type.MNP);
// for dealing with indels in hapmap
CloseableIterator<GATKFeature> dbsnpIterator = null;
public void initialize() {
vcfwriter = new SortingVCFWriter(baseWriter, 40, false);
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) )
return 0;
String rsID = DbSNPHelper.rsIDOfFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
Collection<VariantContext> contexts = tracker.getVariantContexts(ref, INPUT_ROD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), true, false);
Collection<VariantContext> contexts = getVariantContexts(tracker, ref);
for ( VariantContext vc : contexts ) {
Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
if ( rsID != null )
if ( rsID != null && !vc.hasID() ) {
attrs.put(VariantContext.ID_KEY, rsID);
vc = VariantContext.modifyAttributes(vc, attrs);
vc = VariantContext.modifyAttributes(vc, attrs);
}
// set the appropriate sample name if necessary
if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(INPUT_ROD_NAME) ) {
@ -92,6 +109,88 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
return 1;
}
private Collection<VariantContext> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref) {
// we need to special case the HapMap format because indels aren't handled correctly
List<Object> features = tracker.getReferenceMetaData(INPUT_ROD_NAME, true);
if ( features.size() > 0 && features.get(0) instanceof HapMapFeature ) {
ArrayList<VariantContext> hapmapVCs = new ArrayList<VariantContext>(features.size());
for ( Object feature : features ) {
HapMapFeature hapmap = (HapMapFeature)feature;
Byte refBase = null;
// if it's an indel, we need to figure out the alleles
if ( hapmap.getAlleles()[0].equals("-") ) {
Map<String, Allele> alleleMap = new HashMap<String, Allele>(2);
// get the dbsnp object corresponding to this record, so we can learn whether this is an insertion or deletion
DbSNPFeature dbsnp = getDbsnpFeature(hapmap.getName());
if ( dbsnp == null || dbsnp.getVariantType().equalsIgnoreCase("mixed") )
continue;
boolean isInsertion = dbsnp.getVariantType().equalsIgnoreCase("insertion");
alleleMap.put(HapMapFeature.DELETION, Allele.create(Allele.NULL_ALLELE_STRING, isInsertion));
alleleMap.put(HapMapFeature.INSERTION, Allele.create(hapmap.getAlleles()[1], !isInsertion));
hapmap.setActualAlleles(alleleMap);
// also, use the correct positioning for insertions
if ( isInsertion )
hapmap.updatePosition(dbsnp.getStart());
else
hapmap.updatePosition(dbsnp.getStart() - 1);
if ( hapmap.getStart() < ref.getWindow().getStart() ) {
logger.warn("Hapmap record at " + ref.getLocus() + " represents an indel too large to be converted; skipping...");
continue;
}
refBase = ref.getBases()[hapmap.getStart() - ref.getWindow().getStart()];
}
VariantContext vc = VariantContextAdaptors.toVariantContext(INPUT_ROD_NAME, hapmap, ref);
if ( vc != null ) {
if ( refBase != null ) {
Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
attrs.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, refBase);
vc = VariantContext.modifyAttributes(vc, attrs);
}
hapmapVCs.add(vc);
}
}
return hapmapVCs;
}
// for everything else, we can just convert to VariantContext
return tracker.getVariantContexts(ref, INPUT_ROD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, ref.getLocus(), true, false);
}
private DbSNPFeature getDbsnpFeature(String rsID) {
if ( dbsnpIterator == null ) {
ReferenceOrderedDataSource dbsnpDataSource = null;
for ( ReferenceOrderedDataSource ds : getToolkit().getRodDataSources() ) {
if ( ds.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
dbsnpDataSource = ds;
break;
}
}
if ( dbsnpDataSource == null )
throw new UserException.BadInput("No dbSNP rod was provided, but one is needed to decipher the correct indel alleles from the HapMap records");
RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe);
dbsnpIterator = builder.createInstanceOfTrack(DbSNPCodec.class, DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, dbsnpDataSource.getReferenceOrderedData().getFile()).getIterator();
// Note that we should really use some sort of seekable iterator here so that the search doesn't take forever
// (but it's complicated because the hapmap location doesn't match the dbsnp location, so we don't know where to seek to)
}
while ( dbsnpIterator.hasNext() ) {
GATKFeature feature = dbsnpIterator.next();
DbSNPFeature dbsnp = (DbSNPFeature)feature.getUnderlyingObject();
if ( dbsnp.getRsID().equals(rsID) )
return dbsnp;
}
return null;
}
private void writeRecord(VariantContext vc, RefMetaDataTracker tracker, byte ref) {
if ( !wroteHeader ) {
wroteHeader = true;
@ -133,7 +232,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
}
vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings);
vcfwriter.add(vc,ref);
vcfwriter.add(vc, ref);
}
public Integer reduceInit() {
@ -144,5 +243,9 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
return value + sum;
}
public void onTraversalDone(Integer sum) {}
public void onTraversalDone(Integer sum) {
if ( dbsnpIterator != null )
dbsnpIterator.close();
vcfwriter.close();
}
}

View File

@ -56,7 +56,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testGenotypesToVCFUsingHapMapInput() {
List<String> md5 = new ArrayList<String>();
md5.add("946d53387d8048915f880101169620b4");
md5.add("6f34528569f8cf5941cb365fa77288c1");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +