From 84f1ccd6ac1e7090c22b8f7f61826f60eff2eb34 Mon Sep 17 00:00:00 2001 From: chartl Date: Wed, 21 Apr 2010 14:04:10 +0000 Subject: [PATCH] Two dumb oneoff walkers written to fix & annotate the Baylor indel calls (which came in sans reference, and without coding/intron annotations). ERIC -- does the IndelAnnotator (the RefSeq lookup code I stole from IndelGentoyperV2) want to be its own Annotation inside VariantAnnotator? Is Andrey already doing this as part of adding indel calling to UG? git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3226 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/IndelAnnotator.java | 113 ++++++++++++++++++ .../walkers/VCFReferenceFixerWalker.java | 83 +++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java create mode 100644 java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCFReferenceFixerWalker.java diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java new file mode 100644 index 000000000..171a76ee0 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java @@ -0,0 +1,113 @@ +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.genotype.vcf.*; + +import java.util.*; + +/** + * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * + * @Author chartl + * @Date Apr 21, 2010 + */ +public class IndelAnnotator extends RodWalker{ + @Argument(fullName="refseq", shortName="refseq", + doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=true) + String RefseqFileName = null; + + private static String annGenomic = "GENOMIC"; + private static String annIntron = "INTRON"; + private static String annUTR = "UTR"; + private static String annCoding = "CODING"; + private static String annUnknown = "UNKNOWN"; + + private SeekableRODIterator refseqIterator; + private VCFWriter vcfWriter; + + private String getAnnotationString(RODRecordList ann) { + if ( ann == null ) return annGenomic; + else { + StringBuilder b = new StringBuilder(); + + if ( rodRefSeq.isExon(ann) ) { + if ( rodRefSeq.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence + else b.append(annUTR); // exon but not coding = UTR + } else { + if ( rodRefSeq.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron + else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... + } + b.append('\t'); + b.append(((Transcript)ann.get(0).getUnderlyingObject()).getGeneName()); // there is at least one transcript in the list, guaranteed + return b.toString(); + } + } + + public void initialize() { + if ( RefseqFileName != null ) { + ReferenceOrderedData refseq = new ReferenceOrderedData("refseq", + new java.io.File(RefseqFileName), rodRefSeq.class); + + refseqIterator = new SeekableRODIterator(new GATKFeatureIterator(refseq.iterator())); + logger.info("Using RefSeq annotations from "+RefseqFileName); + } + + if ( refseqIterator == null ) logger.info("No annotations available"); + + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFHeaderLine("source", "IndelAnnotator")); + hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName())); + HashSet anno = new HashSet(); + anno.add(new VCFInfoHeaderLine("type",1,VCFInfoHeaderLine.INFO_TYPE.String,"Genomic interpretation (according to RefSeq)")); + hInfo.addAll(anno); + + vcfWriter = new VCFWriter(out); + VCFHeader vcfHeader = new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit())); + vcfWriter.writeHeader(vcfHeader); + } + + public Long reduceInit() { + return 0l; + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext con) { + if ( tracker == null ) + return 0; + + List rods = tracker.getReferenceMetaData("variant"); + // ignore places where we don't have a variant + if ( rods.size() == 0 ) + return 0; + + Object variant = rods.get(0); + + if ( variant instanceof RodVCF ) { + RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(ref.getLocus())); + String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); + annotationString = annotationString.split("\\s+")[0]; + ((RodVCF) variant).getRecord().addInfoField("type",annotationString); + vcfWriter.addRecord(((RodVCF) variant).getRecord()); + } else { + throw new StingException("This one-off walker only deals with VCF files."); + } + + return 1; + } + + public Long reduce(Integer i, Long j) { + return i + j; + } + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCFReferenceFixerWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCFReferenceFixerWalker.java new file mode 100644 index 000000000..1451ab1b4 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCFReferenceFixerWalker.java @@ -0,0 +1,83 @@ +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.genotype.vcf.*; + +import java.util.*; + +/** + * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * + * @Author chartl + * @Date Apr 13, 2010 + */ +public class VCFReferenceFixerWalker extends RodWalker { + + private VCFWriter vcfWriter; + + public void initialize() { + TreeSet samples = new TreeSet(); + SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap, String>()); + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFHeaderLine("source", "VariantAnnotator")); + vcfWriter = new VCFWriter(out); + VCFHeader vcfHeader = new VCFHeader(hInfo, samples); + vcfWriter.writeHeader(vcfHeader); + } + + public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext context, AlignmentContext alicon) { + if ( tracker == null ) { + return null; + } + List rods = tracker.getReferenceMetaData("fixme"); + Object rod = rods.get(0); + RodVCF vcfrod = null; + if ( rod instanceof RodVCF ) { + vcfrod = (RodVCF) rod; + } + + VCFRecord rec = vcfrod.getRecord(); + rec.setReferenceBase(new String(BaseUtils.charSeq2byteSeq(context.getBases()))); + return rec; + + /* + VariantContext vcon = null; + if ( rod instanceof RodVCF) { + vcon = VariantContextAdaptors.toVariantContext("fixme", (RodVCF) rod, new Allele(BaseUtils.charSeq2byteSeq(context.getBases()),true)); + } + if ( vcon == null ) { + return null; + } + Set otherAlleles = vcon.getAlternateAlleles(); + VariantContext fixedContext = new VariantContext(vcon.getName(),context.getLocus(),otherAlleles,vcon.getGenotypes(),vcon.getNegLog10PError(),vcon.getFilters(),vcon.getAttributes()); + return VariantContextAdaptors.toVCF(fixedContext,context.getBase());*/ + } + + public Long reduce(VCFRecord con, Long num) { + if ( con == null ) { + return num; + } + + vcfWriter.addRecord(con); + return 1 + num; + } + + public Long reduceInit() { + return 0l; + } + + public void onTraversalDone(Long num){ + return; + } +}