diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java new file mode 100755 index 000000000..351117809 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java @@ -0,0 +1,77 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.IndelUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; +import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: 6/29/11 + * Time: 3:14 PM + * To change this template use File | Settings | File Templates. + */ +public class TechnologyComposition implements ExperimentalAnnotation,InfoFieldAnnotation { + private String nSLX = "NumSLX"; + private String n454 ="Num454"; + private String nSolid = "NumSOLiD"; + private String nOther = "NumOther"; + public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + int readsIllumina = 0; + int readsSolid = 0; + int reads454 = 0; + int readsOther = 0; + + for ( Map.Entry sample : stratifiedContexts.entrySet() ) { + AlignmentContext context = sample.getValue(); + + ReadBackedPileup pileup = null; + if (context.hasExtendedEventPileup()) + pileup = context.getExtendedEventPileup(); + else if (context.hasBasePileup()) + pileup = context.getBasePileup(); + + if (pileup != null) { + for (PileupElement p : pileup ) { + if(ReadUtils.is454Read(p.getRead())) + reads454++; + else if (ReadUtils.isSOLiDRead(p.getRead())) + readsSolid++; + else if (ReadUtils.isSLXRead(p.getRead())) + readsIllumina++; + else + readsOther++; + } + } + } + + Map map = new HashMap(); + map.put(nSLX, String.format("%d", readsIllumina)); + map.put(n454, String.format("%d", reads454)); + map.put(nSolid, String.format("%d", readsSolid)); + map.put(nOther, String.format("%d", readsOther)); + return map; + } + + public List getKeyNames() { return Arrays.asList(nSLX,n454,nSolid,nOther); } + + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(nSLX, 1, VCFHeaderLineType.Integer, "Number of SLX reads"), + new VCFInfoHeaderLine(n454, 1, VCFHeaderLineType.Integer, "Number of 454 reads"), + new VCFInfoHeaderLine(nSolid, 1, VCFHeaderLineType.Integer, "Number of SOLiD reads"), + new VCFInfoHeaderLine(nOther, 1, VCFHeaderLineType.Integer, "Number of Other technology reads")); } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 963aa0ce5..cbac54326 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -24,14 +24,13 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -42,9 +41,10 @@ import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; +import org.apache.log4j.Logger; import java.io.File; +import java.lang.annotation.AnnotationFormatError; import java.util.*; /** @@ -75,12 +75,24 @@ public class SelectVariants extends RodWalker { @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis.", required=false) private boolean EXCLUDE_FILTERED = false; + @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't include filtered loci.", required=false) + private boolean KEEP_ORIGINAL_CHR_COUNTS = false; + @Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called on a ROD comparison track. Use -disc ROD_NAME", required=false) private String discordanceRodName = ""; @Argument(fullName="concordance", shortName = "conc", doc="Output variants that were also called on a ROD comparison track. Use -conc ROD_NAME", required=false) private String concordanceRodName = ""; + @Hidden + @Argument(fullName="inputAF", shortName = "inputAF", doc="", required=false) + private String inputAFRodName = ""; + + @Hidden + @Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) + private boolean KEEP_AF_SPECTRUM = false; + + @Argument(fullName="family_structure", shortName="family", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) private String FAMILY_STRUCTURE = ""; @@ -178,6 +190,12 @@ public class SelectVariants extends RodWalker { // Initialize VCF header Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); headerLines.add(new VCFHeaderLine("source", "SelectVariants")); + + if (KEEP_ORIGINAL_CHR_COUNTS) { + headerLines.add(new VCFFormatHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC")); + headerLines.add(new VCFFormatHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF")); + headerLines.add(new VCFFormatHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN")); + } vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) { @@ -269,6 +287,45 @@ public class SelectVariants extends RodWalker { else if (!SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom) { vcfWriter.add(sub, ref.getBase()); } + else { + if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) { + Collection compVCs = tracker.getVariantContexts(ref, inputAFRodName, null, context.getLocation(), true, false); + if (compVCs.isEmpty()) + return 0; + + // ok we have a comp VC and we need to match the AF spectrum of inputAFRodName. + // We then pick a variant with probablity AF*desiredFraction + for (VariantContext compVC : compVCs) { + if ( compVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) { + String afo = compVC.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY); + + double af; + if (afo.contains(",")) { + String[] afs = afo.split(","); + afs[0] = afs[0].substring(1,afs[0].length()); + afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); + + double[] afd = new double[afs.length]; + + for (int k=0; k < afd.length; k++) + afd[k] = Double.valueOf(afs[k]); + + af = MathUtils.arrayMax(afd); + //af = Double.valueOf(afs[0]); + + } + else + af = Double.valueOf(afo); + + //System.out.format("%s .. %4.4f\n",afo.toString(), af); + if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * af) + vcfWriter.add(sub, ref.getBase()); + } + break; // do only one vc + } + + } + } } } @@ -413,6 +470,16 @@ public class SelectVariants extends RodWalker { } + if (KEEP_ORIGINAL_CHR_COUNTS) { + if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) + attributes.put("AC_Orig",attributes.get(VCFConstants.ALLELE_COUNT_KEY)); + if ( attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) ) + attributes.put("AF_Orig",attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY)); + if ( attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY) ) + attributes.put("AN_Orig",attributes.get(VCFConstants.ALLELE_NUMBER_KEY)); + + } + VariantContextUtils.calculateChromosomeCounts(sub,attributes,false); attributes.put("DP", depth);