From 708e973911761e99ff106efadd30d55231142ca5 Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 27 Oct 2010 06:08:26 +0000 Subject: [PATCH] Adding a walker to left-align indels in a VCF file (was able to reuse code from AlignmentUtils to do the hard part). The code correctly updates the alleles if they change. This makes it much easier to compare our indel calls to e.g. CG or dbSNP. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4582 348d0f76-0448-11de-a6fe-93d51630548a --- .../variantutils/LeftAlignVariants.java | 183 ++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java new file mode 100755 index 000000000..d2ce95985 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFHeader; +import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.vcf.VCFWriter; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.vcf.VCFUtils; + +import java.util.*; + +/** + * Left-aligns indels from a variants file. + */ +@Reference(window=@Window(start=-200,stop=200)) +@Requires(value={},referenceMetaData=@RMD(name="variant", type=VariantContext.class)) +public class LeftAlignVariants extends RodWalker { + + @Output(doc="File to which variants should be written",required=true) + protected VCFWriter writer = null; + + public void initialize() { + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList("variant")); + Map vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant")); + + Set headerLines = vcfHeaders.get("variant").getMetaData(); + writer.writeHeader(new VCFHeader(headerLines, samples)); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + Collection VCs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false); + for ( VariantContext vc : VCs ) + alignAndWrite(vc, ref); + + return 0; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { return 0; } + + public void onTraversalDone(Integer result) {} + + private void alignAndWrite(VariantContext vc, final ReferenceContext ref) { + if ( vc.isBiallelic() && vc.isIndel() ) + writeLeftAlignedIndel(vc, ref); + else + writer.add(vc, ref.getBase()); + } + + private void writeLeftAlignedIndel(final VariantContext vc, final ReferenceContext ref) { + final byte[] refSeq = ref.getBases(); + + // get the indel length + int indelLength; + if ( vc.isDeletion() ) + indelLength = vc.getReference().length(); + else + indelLength = vc.getAlternateAllele(0).length(); + + // create an indel haplotype + int originalIndex = (int)(ref.getLocus().getStart() - ref.getWindow().getStart()) + 1; + final byte[] originalIndel = makeHaplotype(vc, refSeq, originalIndex, indelLength); + + // create a CIGAR string to represent the event + ArrayList elements = new ArrayList(); + elements.add(new CigarElement(originalIndex, CigarOperator.M)); + elements.add(new CigarElement(indelLength, vc.isDeletion() ? CigarOperator.D : CigarOperator.I)); + elements.add(new CigarElement(refSeq.length - originalIndex, CigarOperator.M)); + Cigar originalCigar = new Cigar(elements); + + // left align the CIGAR + Cigar newCigar = AlignmentUtils.leftAlignIndel(originalCigar, refSeq, originalIndel, 0, 0); + + // update if necessary and write + if ( !newCigar.equals(originalCigar) ) { + int difference = originalIndex - newCigar.getCigarElement(0).getLength(); + GenomeLoc newLoc = GenomeLocParser.createPotentiallyInvalidGenomeLoc(vc.getChr(), vc.getStart()-difference, vc.getEnd()-difference); + //System.out.println("Moving record from " + vc.getChr()+":"+vc.getStart() + " to " + newLoc); + VariantContext newVC = VariantContextUtils.modifyLocation(vc, newLoc); + + int indelIndex = originalIndex-difference; + byte[] newBases = new byte[indelLength]; + System.arraycopy(refSeq, indelIndex, newBases, 0, indelLength); + Allele newAllele = Allele.create(newBases, vc.isDeletion()); + newVC = updateAllele(newVC, newAllele); + + writer.add(newVC, refSeq[indelIndex-1]); + } else { + writer.add(vc, ref.getBase()); + } + } + + private static byte[] makeHaplotype(VariantContext vc, byte[] ref, int indexOfRef, int indelLength) { + byte[] hap = new byte[ref.length + (indelLength * (vc.isDeletion() ? -1 : 1))]; + + // add the bases before the indel + System.arraycopy(ref, 0, hap, 0, indexOfRef); + int currentPos = indexOfRef; + + // take care of the indel + if ( vc.isDeletion() ) { + indexOfRef += indelLength; + } else { + System.arraycopy(vc.getAlternateAllele(0).getBases(), 0, hap, currentPos, indelLength); + currentPos += indelLength; + } + + // add the bases after the indel + System.arraycopy(ref, indexOfRef, hap, currentPos, ref.length - indexOfRef); + + return hap; + } + + public static VariantContext updateAllele(VariantContext vc, Allele newAllele) { + // create a mapping from original allele to new allele + HashMap alleleMap = new HashMap(vc.getAlleles().size()); + if ( newAllele.isReference() ) { + alleleMap.put(vc.getReference(), newAllele); + alleleMap.put(vc.getAlternateAllele(0), vc.getAlternateAllele(0)); + } else { + alleleMap.put(vc.getReference(), vc.getReference()); + alleleMap.put(vc.getAlternateAllele(0), newAllele); + } + + // create new Genotype objects + Map newGenotypes = new HashMap(vc.getNSamples()); + for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { + List newAlleles = new ArrayList(); + for ( Allele allele : genotype.getValue().getAlleles() ) { + Allele newA = alleleMap.get(allele); + if ( newA == null ) + newA = Allele.NO_CALL; + newAlleles.add(newA); + } + newGenotypes.put(genotype.getKey(), Genotype.modifyAlleles(genotype.getValue(), newAlleles)); + } + + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes()); + } +}