Support for indels in HapMap. This was non-trivial because not only does HapMap not tell you whether the allele is an insertion or a deletion, but it also uses a completely different positioning strategy (rightmost base). I'll send out an email tomorrow when the new HapMap 3.3 VCF is ready.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4908 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6d809321d3
commit
8a0c07b865
|
|
@ -11,7 +11,6 @@ import org.broad.tribble.vcf.*;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -218,10 +217,21 @@ public class VariantContextAdaptors {
|
|||
|
||||
HapMapFeature hapmap = (HapMapFeature)input;
|
||||
|
||||
// add the reference allele
|
||||
HashSet<Allele> alleles = new HashSet<Allele>();
|
||||
Allele refAllele = Allele.create(ref.getBase(), true);
|
||||
alleles.add(refAllele);
|
||||
Allele refSNPAllele = Allele.create(ref.getBase(), true);
|
||||
int deletionLength = -1;
|
||||
|
||||
Map<String, Allele> alleleMap = hapmap.getActualAlleles();
|
||||
// use the actual alleles, if available
|
||||
if ( alleleMap != null ) {
|
||||
alleles.addAll(alleleMap.values());
|
||||
Allele deletionAllele = alleleMap.get(HapMapFeature.INSERTION); // yes, use insertion here (since we want the reference bases)
|
||||
if ( deletionAllele != null && deletionAllele.isReference() )
|
||||
deletionLength = deletionAllele.length();
|
||||
} else {
|
||||
// add the reference allele for SNPs
|
||||
alleles.add(refSNPAllele);
|
||||
}
|
||||
|
||||
// make a mapping from sample to genotype
|
||||
String[] samples = hapmap.getSampleIDs();
|
||||
|
|
@ -230,26 +240,42 @@ public class VariantContextAdaptors {
|
|||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(samples.length);
|
||||
for ( int i = 0; i < samples.length; i++ ) {
|
||||
// ignore bad genotypes
|
||||
if ( genotypeStrings[i].contains("N") || genotypeStrings[i].contains("I") || genotypeStrings[i].contains("D") )
|
||||
if ( genotypeStrings[i].contains("N") )
|
||||
continue;
|
||||
|
||||
String a1 = genotypeStrings[i].substring(0,1);
|
||||
String a2 = genotypeStrings[i].substring(1);
|
||||
|
||||
Allele allele1 = Allele.create(a1, refAllele.basesMatch(a1));
|
||||
Allele allele2 = Allele.create(a2, refAllele.basesMatch(a2));
|
||||
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2);
|
||||
myAlleles.add(allele1);
|
||||
myAlleles.add(allele2);
|
||||
alleles.add(allele1);
|
||||
alleles.add(allele2);
|
||||
|
||||
// use the mapping to actual alleles, if available
|
||||
if ( alleleMap != null ) {
|
||||
myAlleles.add(alleleMap.get(a1));
|
||||
myAlleles.add(alleleMap.get(a2));
|
||||
} else {
|
||||
// ignore indels (which we can't handle without knowing the alleles)
|
||||
if ( genotypeStrings[i].contains("I") || genotypeStrings[i].contains("D") )
|
||||
continue;
|
||||
|
||||
Allele allele1 = Allele.create(a1, refSNPAllele.basesMatch(a1));
|
||||
Allele allele2 = Allele.create(a2, refSNPAllele.basesMatch(a2));
|
||||
|
||||
myAlleles.add(allele1);
|
||||
myAlleles.add(allele2);
|
||||
alleles.add(allele1);
|
||||
alleles.add(allele2);
|
||||
}
|
||||
|
||||
Genotype g = new Genotype(samples[i], myAlleles);
|
||||
genotypes.put(samples[i], g);
|
||||
}
|
||||
|
||||
VariantContext vc = new VariantContext(name, hapmap.getChr(), hapmap.getStart(), hapmap.getEnd(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, new HashMap<String, String>());
|
||||
HashMap<String, Object> attrs = new HashMap<String, Object>(1);
|
||||
attrs.put(VariantContext.ID_KEY, hapmap.getName());
|
||||
|
||||
long end = hapmap.getEnd();
|
||||
if ( deletionLength > 0 )
|
||||
end += deletionLength;
|
||||
VariantContext vc = new VariantContext(name, hapmap.getChr(), hapmap.getStart(), end, alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attrs);
|
||||
return vc;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,17 +25,25 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broad.tribble.dbsnp.DbSNPCodec;
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
import org.broad.tribble.hapmap.HapMapFeature;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
|
|
@ -46,11 +54,12 @@ import java.util.*;
|
|||
* Converts variants from other file formats to VCF format.
|
||||
*/
|
||||
@Requires(value={},referenceMetaData=@RMD(name=VariantsToVCF.INPUT_ROD_NAME, type=VariantContext.class))
|
||||
@Reference(window=@Window(start=0,stop=40))
|
||||
@Reference(window=@Window(start=-40,stop=40))
|
||||
public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
||||
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter vcfwriter = null;
|
||||
protected VCFWriter baseWriter = null;
|
||||
private SortingVCFWriter vcfwriter; // needed because hapmap indel records move
|
||||
|
||||
public static final String INPUT_ROD_NAME = "variant";
|
||||
|
||||
|
|
@ -64,19 +73,27 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
|||
private EnumSet<VariantContext.Type> ALLOWED_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP,
|
||||
VariantContext.Type.NO_VARIATION, VariantContext.Type.INDEL, VariantContext.Type.MNP);
|
||||
|
||||
// for dealing with indels in hapmap
|
||||
CloseableIterator<GATKFeature> dbsnpIterator = null;
|
||||
|
||||
public void initialize() {
|
||||
vcfwriter = new SortingVCFWriter(baseWriter, 40, false);
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) )
|
||||
return 0;
|
||||
|
||||
String rsID = DbSNPHelper.rsIDOfFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
|
||||
|
||||
Collection<VariantContext> contexts = tracker.getVariantContexts(ref, INPUT_ROD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), true, false);
|
||||
Collection<VariantContext> contexts = getVariantContexts(tracker, ref);
|
||||
|
||||
for ( VariantContext vc : contexts ) {
|
||||
Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
|
||||
if ( rsID != null )
|
||||
if ( rsID != null && !vc.hasID() ) {
|
||||
attrs.put(VariantContext.ID_KEY, rsID);
|
||||
vc = VariantContext.modifyAttributes(vc, attrs);
|
||||
vc = VariantContext.modifyAttributes(vc, attrs);
|
||||
}
|
||||
|
||||
// set the appropriate sample name if necessary
|
||||
if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(INPUT_ROD_NAME) ) {
|
||||
|
|
@ -92,6 +109,88 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
|||
return 1;
|
||||
}
|
||||
|
||||
/**
 * Fetches the variant records at the current locus and converts them to VariantContexts,
 * special-casing HapMap records because the HapMap format does not encode indels correctly:
 * it does not say whether a "-" allele is an insertion or a deletion, and it positions
 * indels differently (rightmost base) than VCF does.
 *
 * @param tracker the meta-data tracker holding the bound ROD tracks at this locus
 * @param ref     the reference context (window of reference bases) at this locus
 * @return the VariantContexts at this locus; HapMap indel records that cannot be
 *         deciphered (no dbSNP match, "mixed" type, or indel larger than the reference
 *         window) are silently skipped
 */
private Collection<VariantContext> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref) {
    // we need to special case the HapMap format because indels aren't handled correctly
    List<Object> features = tracker.getReferenceMetaData(INPUT_ROD_NAME, true);
    if ( features.size() > 0 && features.get(0) instanceof HapMapFeature ) {
        ArrayList<VariantContext> hapmapVCs = new ArrayList<VariantContext>(features.size());
        for ( Object feature : features ) {
            HapMapFeature hapmap = (HapMapFeature)feature;
            // non-null only for indel records; carries the padding reference base for the VCF record
            Byte refBase = null;

            // if it's an indel, we need to figure out the alleles
            // ("-" as the first allele is how HapMap marks an indel record)
            if ( hapmap.getAlleles()[0].equals("-") ) {
                Map<String, Allele> alleleMap = new HashMap<String, Allele>(2);

                // get the dbsnp object corresponding to this record, so we can learn whether this is an insertion or deletion
                DbSNPFeature dbsnp = getDbsnpFeature(hapmap.getName());
                // without a usable dbSNP record we cannot tell insertion from deletion, so skip
                if ( dbsnp == null || dbsnp.getVariantType().equalsIgnoreCase("mixed") )
                    continue;

                boolean isInsertion = dbsnp.getVariantType().equalsIgnoreCase("insertion");

                // for an insertion the deleted ("-") allele is the reference; for a deletion the inserted bases are
                alleleMap.put(HapMapFeature.DELETION, Allele.create(Allele.NULL_ALLELE_STRING, isInsertion));
                alleleMap.put(HapMapFeature.INSERTION, Allele.create(hapmap.getAlleles()[1], !isInsertion));
                hapmap.setActualAlleles(alleleMap);

                // also, use the correct positioning for insertions
                // (HapMap anchors on the rightmost base; VCF anchors on the base before the event)
                if ( isInsertion )
                    hapmap.updatePosition(dbsnp.getStart());
                else
                    hapmap.updatePosition(dbsnp.getStart() - 1);

                // repositioning can move the record before the visible reference window; give up if so
                if ( hapmap.getStart() < ref.getWindow().getStart() ) {
                    logger.warn("Hapmap record at " + ref.getLocus() + " represents an indel too large to be converted; skipping...");
                    continue;
                }
                refBase = ref.getBases()[hapmap.getStart() - ref.getWindow().getStart()];
            }
            VariantContext vc = VariantContextAdaptors.toVariantContext(INPUT_ROD_NAME, hapmap, ref);
            if ( vc != null ) {
                if ( refBase != null ) {
                    // attach the padding base so the indel can be written as a valid VCF record
                    Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
                    attrs.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, refBase);
                    vc = VariantContext.modifyAttributes(vc, attrs);
                }
                hapmapVCs.add(vc);
            }
        }
        return hapmapVCs;
    }

    // for everything else, we can just convert to VariantContext
    return tracker.getVariantContexts(ref, INPUT_ROD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, ref.getLocus(), true, false);
}
|
||||
|
||||
/**
 * Looks up the dbSNP record with the given rs ID by scanning forward through the dbSNP
 * track; used to decide whether a HapMap indel record is an insertion or a deletion.
 *
 * Lazily opens a standalone iterator over the dbSNP ROD file on first use. The iterator
 * is shared across calls and never reset, so lookups are forward-only.
 * NOTE(review): this appears to assume rs IDs are requested in the same order they occur
 * in the dbSNP file — an rs ID behind the iterator's current position will return null;
 * confirm against the caller's traversal order.
 *
 * @param rsID the dbSNP rs ID to search for (the HapMap record's name)
 * @return the matching dbSNP feature, or null if not found before the iterator is exhausted
 * @throws UserException.BadInput if no dbSNP track was bound on the command line
 */
private DbSNPFeature getDbsnpFeature(String rsID) {
    if ( dbsnpIterator == null ) {
        // find the dbSNP data source among the bound ROD tracks
        ReferenceOrderedDataSource dbsnpDataSource = null;
        for ( ReferenceOrderedDataSource ds : getToolkit().getRodDataSources() ) {
            if ( ds.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
                dbsnpDataSource = ds;
                break;
            }
        }

        if ( dbsnpDataSource == null )
            throw new UserException.BadInput("No dbSNP rod was provided, but one is needed to decipher the correct indel alleles from the HapMap records");

        // open our own iterator over the dbSNP file, independent of the engine's traversal
        RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe);
        dbsnpIterator = builder.createInstanceOfTrack(DbSNPCodec.class, DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, dbsnpDataSource.getReferenceOrderedData().getFile()).getIterator();
        // Note that we should really use some sort of seekable iterator here so that the search doesn't take forever
        // (but it's complicated because the hapmap location doesn't match the dbsnp location, so we don't know where to seek to)
    }

    // linear scan from the iterator's current position
    while ( dbsnpIterator.hasNext() ) {
        GATKFeature feature = dbsnpIterator.next();
        DbSNPFeature dbsnp = (DbSNPFeature)feature.getUnderlyingObject();
        if ( dbsnp.getRsID().equals(rsID) )
            return dbsnp;
    }

    return null;
}
|
||||
|
||||
private void writeRecord(VariantContext vc, RefMetaDataTracker tracker, byte ref) {
|
||||
if ( !wroteHeader ) {
|
||||
wroteHeader = true;
|
||||
|
|
@ -133,7 +232,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings);
|
||||
vcfwriter.add(vc,ref);
|
||||
vcfwriter.add(vc, ref);
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
|
|
@ -144,5 +243,9 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
|||
return value + sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer sum) {}
|
||||
/**
 * Releases resources at the end of the traversal: closes the lazily-opened dbSNP
 * iterator (if any lookups were performed) and the sorting VCF writer, flushing
 * any buffered, reordered records.
 *
 * @param sum the final reduce value (unused)
 */
public void onTraversalDone(Integer sum) {
    if ( dbsnpIterator != null )
        dbsnpIterator.close();
    vcfwriter.close();
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testGenotypesToVCFUsingHapMapInput() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("946d53387d8048915f880101169620b4");
|
||||
md5.add("6f34528569f8cf5941cb365fa77288c1");
|
||||
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
|
|
|
|||
Loading…
Reference in New Issue