Two dumb oneoff walkers written to fix & annotate the Baylor indel calls (which came in sans reference, and without coding/intron annotations).

ERIC -- does the IndelAnnotator (the RefSeq lookup code I stole from IndelGentoyperV2) want to be its own Annotation inside VariantAnnotator? Is Andrey already doing this as part of adding indel calling to UG?



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3226 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-04-21 14:04:10 +00:00
parent 2fdc1cf490
commit 84f1ccd6ac
2 changed files with 196 additions and 0 deletions

View File

@ -0,0 +1,113 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.util.*;
/**
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
*
* @Author chartl
* @Date Apr 21, 2010
*/
public class IndelAnnotator extends RodWalker<Integer,Long>{
@Argument(fullName="refseq", shortName="refseq",
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=true)
String RefseqFileName = null;
private static String annGenomic = "GENOMIC";
private static String annIntron = "INTRON";
private static String annUTR = "UTR";
private static String annCoding = "CODING";
private static String annUnknown = "UNKNOWN";
private SeekableRODIterator refseqIterator;
private VCFWriter vcfWriter;
private String getAnnotationString(RODRecordList ann) {
if ( ann == null ) return annGenomic;
else {
StringBuilder b = new StringBuilder();
if ( rodRefSeq.isExon(ann) ) {
if ( rodRefSeq.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence
else b.append(annUTR); // exon but not coding = UTR
} else {
if ( rodRefSeq.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron
else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon...
}
b.append('\t');
b.append(((Transcript)ann.get(0).getUnderlyingObject()).getGeneName()); // there is at least one transcript in the list, guaranteed
return b.toString();
}
}
public void initialize() {
if ( RefseqFileName != null ) {
ReferenceOrderedData<rodRefSeq> refseq = new ReferenceOrderedData<rodRefSeq>("refseq",
new java.io.File(RefseqFileName), rodRefSeq.class);
refseqIterator = new SeekableRODIterator(new GATKFeatureIterator(refseq.iterator()));
logger.info("Using RefSeq annotations from "+RefseqFileName);
}
if ( refseqIterator == null ) logger.info("No annotations available");
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "IndelAnnotator"));
hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
HashSet<VCFInfoHeaderLine> anno = new HashSet<VCFInfoHeaderLine>();
anno.add(new VCFInfoHeaderLine("type",1,VCFInfoHeaderLine.INFO_TYPE.String,"Genomic interpretation (according to RefSeq)"));
hInfo.addAll(anno);
vcfWriter = new VCFWriter(out);
VCFHeader vcfHeader = new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit()));
vcfWriter.writeHeader(vcfHeader);
}
public Long reduceInit() {
return 0l;
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext con) {
if ( tracker == null )
return 0;
List<Object> rods = tracker.getReferenceMetaData("variant");
// ignore places where we don't have a variant
if ( rods.size() == 0 )
return 0;
Object variant = rods.get(0);
if ( variant instanceof RodVCF ) {
RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(ref.getLocus()));
String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList));
annotationString = annotationString.split("\\s+")[0];
((RodVCF) variant).getRecord().addInfoField("type",annotationString);
vcfWriter.addRecord(((RodVCF) variant).getRecord());
} else {
throw new StingException("This one-off walker only deals with VCF files.");
}
return 1;
}
public Long reduce(Integer i, Long j) {
return i + j;
}
}

View File

@ -0,0 +1,83 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.util.*;
/**
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
*
* @Author chartl
* @Date Apr 13, 2010
*/
public class VCFReferenceFixerWalker extends RodWalker<VCFRecord,Long> {
private VCFWriter vcfWriter;
public void initialize() {
TreeSet<String> samples = new TreeSet<String>();
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "VariantAnnotator"));
vcfWriter = new VCFWriter(out);
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
}
public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext context, AlignmentContext alicon) {
if ( tracker == null ) {
return null;
}
List<Object> rods = tracker.getReferenceMetaData("fixme");
Object rod = rods.get(0);
RodVCF vcfrod = null;
if ( rod instanceof RodVCF ) {
vcfrod = (RodVCF) rod;
}
VCFRecord rec = vcfrod.getRecord();
rec.setReferenceBase(new String(BaseUtils.charSeq2byteSeq(context.getBases())));
return rec;
/*
VariantContext vcon = null;
if ( rod instanceof RodVCF) {
vcon = VariantContextAdaptors.toVariantContext("fixme", (RodVCF) rod, new Allele(BaseUtils.charSeq2byteSeq(context.getBases()),true));
}
if ( vcon == null ) {
return null;
}
Set<Allele> otherAlleles = vcon.getAlternateAlleles();
VariantContext fixedContext = new VariantContext(vcon.getName(),context.getLocus(),otherAlleles,vcon.getGenotypes(),vcon.getNegLog10PError(),vcon.getFilters(),vcon.getAttributes());
return VariantContextAdaptors.toVCF(fixedContext,context.getBase());*/
}
public Long reduce(VCFRecord con, Long num) {
if ( con == null ) {
return num;
}
vcfWriter.addRecord(con);
return 1 + num;
}
public Long reduceInit() {
return 0l;
}
public void onTraversalDone(Long num){
return;
}
}