From 2bc5971ca1c90a41008bcbd7490fbcc94c56a04b Mon Sep 17 00:00:00 2001 From: chartl Date: Sun, 17 Oct 2010 03:18:01 +0000 Subject: [PATCH] Added - a tool to fix reference bases of a VCF. The OMNI had a couple of sites with incorrect reference bases (look to be legacy from other chips), and a few more that had ref and alt flipped. GAP should probably take care of it, but since I need results by monday, I'm doing it. Modified - SelectVariants: Hook up to VariantContextUtils to recalculate AC/AF/AN, which uses the accessor in VariantContext to do this. Somehow sites that were selected down to hom-ref genotypes only wound up getting positive AC. **IMPORTANT** I kind of need input here. The header of a file used for an integration test specifies AC as being an integer. Recalculating it casts it into an integer list (which it should be, as it allows for alternate alleles). However this appears to clash with what the jexl expression is looking for? For now, the integration test itself needed to be changed -- it's unclear what to do when the header specifies AC of being one class, but recalculating it casts to another class, and I'm not sure what to do. I'm committing my omni_qc pipeline because I'm almost certain 2 months down the road I'm going to wonder what the heck I did to generate my results. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4511 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/variantutils/SelectVariants.java | 18 +- .../AlleleFrequencyComparison.java | 63 +++- .../walkers/vcftools/FixRefBases.java | 174 ++++++++++ .../SelectVariantsIntegrationTest.java | 4 +- scala/qscript/chartl/omni_qc.q | 314 ++++++++++++++---- 5 files changed, 490 insertions(+), 83 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index b602f31f9..e8e57986c 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -212,6 +212,7 @@ public class SelectVariants extends RodWalker { VariantContext sub = subsetRecord(vc, samples); if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { + //System.out.printf("%s%n",sub.toString()); for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) { if ( !VariantContextUtils.match(sub, jexl) ) { return 0; @@ -231,7 +232,7 @@ public class SelectVariants extends RodWalker { * @param vc the VariantContext record to subset * @param samples the samples to extract * @return the subsetted VariantContext - */ + */ private VariantContext subsetRecord(VariantContext vc, Set samples) { if ( samples == null || samples.isEmpty() ) return vc; @@ -246,17 +247,11 @@ public class SelectVariants extends RodWalker { HashMap attributes = new HashMap(sub.getAttributes()); - int alleleCount = 0; - int numberOfAlleles = 0; int depth = 0; for (String sample : sub.getSampleNames()) { Genotype g = sub.getGenotype(sample); if (g.isNotFiltered() && g.isCalled()) { - numberOfAlleles += g.getPloidy(); - - if (g.isHet()) { alleleCount++; } - else if (g.isHomVar()) { alleleCount += 2; } String dp = (String) g.getAttribute("DP"); if (dp != null && ! dp.equals(VCFConstants.MISSING_DEPTH_v3) && ! dp.equals(VCFConstants.MISSING_VALUE_v4) ) { @@ -265,13 +260,8 @@ public class SelectVariants extends RodWalker { } } - attributes.put("AC", alleleCount); - attributes.put("AN", numberOfAlleles); - if (numberOfAlleles == 0) { - attributes.put("AF", 0.0); - } else { - attributes.put("AF", ((double) alleleCount) / ((double) numberOfAlleles)); - } + + VariantContextUtils.calculateChromosomeCounts(sub,attributes,false); attributes.put("DP", depth); sub = VariantContext.modifyAttributes(sub, attributes); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/AlleleFrequencyComparison.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/AlleleFrequencyComparison.java index 16331ac03..924eed830 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/AlleleFrequencyComparison.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/AlleleFrequencyComparison.java @@ -6,11 +6,14 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.report.tags.Analysis; import org.broadinstitute.sting.utils.report.tags.DataPoint; import org.broadinstitute.sting.utils.report.utils.TableType; +import org.broadinstitute.sting.utils.vcf.VCFUtils; import java.util.List; +import java.util.Map; /** * @@ -40,6 +43,12 @@ public class AlleleFrequencyComparison extends VariantEvaluator { if ( ! (isValidVC(eval) && isValidVC(comp)) ) { return null; } else { + if ( missingField(eval) ) { + recalculateCounts(eval); + } + if ( missingField(comp) ) { + recalculateCounts(comp); + } afTable.update(getAF(eval),getAF(comp)); acTable.update(getAC(eval),getAC(comp)); } @@ -47,16 +56,64 @@ public class AlleleFrequencyComparison extends VariantEvaluator { return null; // there is nothing interesting } + private static boolean missingField(final VariantContext vc) { + return ! ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)); + } + + private static void recalculateCounts(VariantContext vc) { + Map attributes = vc.getAttributes(); + VariantContextUtils.calculateChromosomeCounts(vc,attributes,false); + VariantContext.modifyAttributes(vc,attributes); + } + private static boolean isValidVC(final VariantContext vc) { - return (vc != null && !vc.isFiltered() && vc.getAlternateAlleles().size() == 1 && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)); + return (vc != null && !vc.isFiltered() && vc.getAlternateAlleles().size() == 1); } private static double getAF(VariantContext vc) { - return ((List) vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)).get(0); + Object af = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY); + if ( List.class.isAssignableFrom(af.getClass())) { + return ( (List) af ).get(0); + } else if ( String.class.isAssignableFrom(af.getClass())) { + // two possibilities + String s = (String) af; + try { + if ( s.startsWith("[") ) { + return Double.parseDouble(s.replace("\\[","").replace("\\]","")); + } else { + return Double.parseDouble(s); + } + } catch (NumberFormatException e) { + throw new UserException("Allele frequency field may be improperly formatted, found AF="+s); + } + } else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) { + return (Double) af; + } else { + throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",af.toString(),af.getClass())); + } } private static int getAC(VariantContext vc) { - return ((List) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY)).get(0); + Object ac = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY); + if ( List.class.isAssignableFrom(ac.getClass())) { + return ( (List) ac ).get(0); + } else if ( String.class.isAssignableFrom(ac.getClass())) { + // two possibilities + String s = (String) ac; + try { + if ( s.startsWith("[") ) { + return Integer.parseInt(s.replace("\\[","").replace("\\]","")); + } else { + return Integer.parseInt(s); + } + } catch (NumberFormatException e) { + throw new UserException("Allele count field may be improperly formatted, found AC="+s); + } + } else if ( Double.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY).getClass())) { + return (Integer) ac; + } else { + throw new UserException(String.format("Class of Allele Frequency does not appear to be formated, had AF=%s, of class %s",ac.toString(),ac.getClass())); + } } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java new file mode 100755 index 000000000..2bb238c4d --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/FixRefBases.java @@ -0,0 +1,174 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.vcftools; + +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFConstants; +import org.broad.tribble.vcf.VCFHeader; +import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.vcf.VCFWriter; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RMD; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.vcf.VCFUtils; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Oct 16, 2010 + * Time: 8:20:41 PM + * To change this template use File | Settings | File Templates. + */ +@Requires(value={},referenceMetaData=@RMD(name="variant", type= VariantContext.class)) +public class FixRefBases extends RodWalker { + @Output(doc="output file to write to",required=true) + VCFWriter out; + + public void initialize() { + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant")); + Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); + headerLines.add(new VCFHeaderLine("source", "SelectVariants")); + out.writeHeader(new VCFHeader(headerLines)); + } + + public Integer reduceInit() { + return 0; + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker != null && tracker.hasROD("variant") ) { + VariantContext vc = tracker.getVariantContext(ref,"variant",null,context.getLocation(),true); + VariantContext newContext = null; + if ( vc.isSNP() && ref.getBase() != vc.getReference().getBases()[0] && vc.getReference().length() == 1 ) { + if ( basesAreFlipped(vc,ref) ) { + logger.warn(String.format("Variant context at %s has ref and alt bases flipped according to reference",ref.getLocus().toString())); + newContext = flipBases(vc,ref); + } else { + HashSet newAlleles = new HashSet(vc.getAlternateAlleles()); + Allele refAllele = Allele.create(ref.getBase(),true); + newAlleles.add(refAllele); + newContext = new VariantContext("FixRefBasesVC", ref.getLocus().getContig(), + ref.getLocus().getStart(), ref.getLocus().getStop(), newAlleles, fixGenotypes(vc.getGenotypes(),refAllele), + vc.hasNegLog10PError() ? 10.0*vc.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, + vc.isFiltered() ? null : vc.getFilters(), vc.getAttributes()); + } + + out.add(newContext,ref.getBase()); + + } else { + out.add(vc,ref.getBase()); + } + } + + return 0; + } + + public Integer reduce(Integer map, Integer reduce) { + return map + reduce; + } + + public void onTraversalDone(Integer fReduce) { + logger.info(String.format("Fixed %d records",fReduce)); + } + + private boolean basesAreFlipped(VariantContext vc, ReferenceContext reference) { + for ( Allele a : vc.getAlternateAlleles() ) { + if ( a.getBases().length == 1 && a.getBases()[0] == reference.getBase() ) { + return true; + } + } + return false; + } + + private VariantContext flipBases(VariantContext vc, ReferenceContext ref) { + HashSet newAlleles = new HashSet(vc.getAlleles().size()); + newAlleles.add(Allele.create(ref.getBase(),true)); + newAlleles.add(Allele.create(vc.getReference().getBases()[0],false)); + for ( Allele a : vc.getAlternateAlleles() ) { + if ( a.getBases()[0] != ref.getBase() ) { + newAlleles.add(a); + } + } + + VariantContext newVC = new VariantContext("FixRefBasesVC", ref.getLocus().getContig(), + ref.getLocus().getStart(), ref.getLocus().getStop(), newAlleles, flipGenotypes(vc.getGenotypes(),newAlleles), + vc.hasNegLog10PError() ? 10.0*vc.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, + vc.isFiltered() ? null : vc.getFilters(), vc.getAttributes()); + + Map attribs = new HashMap(newVC.getAttributes()); + VariantContextUtils.calculateChromosomeCounts(newVC,attribs,false); + VariantContext.modifyAttributes(newVC,attribs); + return newVC; + } + + private Map fixGenotypes(Map old, Allele newRef) { + HashMap newGTs = new HashMap(old.size()); + for ( Map.Entry e : old.entrySet() ) { + newGTs.put(e.getKey(),fixGenotype(e.getValue(),newRef)); + } + + return newGTs; + } + + private Genotype fixGenotype(Genotype g, Allele newRef) { + List newAlleles = new ArrayList(g.getAlleles().size()); + for ( Allele a : g.getAlleles() ) { + if ( a.isReference() ) { + newAlleles.add(newRef); + } else { + newAlleles.add(a); + } + } + return new Genotype(g.getSampleName(), + newAlleles,g.hasNegLog10PError() ? g.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, + g.getAttributes().keySet(), g.getAttributes(),g.genotypesArePhased()); + } + + private Map flipGenotypes(Map old, Set newAlleles) { + HashMap newGTs = new HashMap(old.size()); + for ( Map.Entry e : old.entrySet() ) { + newGTs.put(e.getKey(),flipGenotype(e.getValue(),newAlleles)); + } + + return newGTs; + } + + private Genotype flipGenotype(Genotype old, Set newAlleles) { + Allele ref = null; + for ( Allele a : newAlleles ) { + if ( a.isReference() ) { + ref = a; + } + } + if ( ref == null ) { + throw new StingException("No reference allele in variant context with which to flip genotype alleles"); + } + + List newGTAlleles = new ArrayList(old.getAlleles().size()); + + for ( Allele a : old.getAlleles() ) { + if ( ! a.isNoCall() && a.getBases()[0] == ref.getBases()[0] ) { + newGTAlleles.add(ref); + } else { + if ( a.isReference() ) { + newGTAlleles.add(Allele.create(a.getBases(),false)); + } else { + newGTAlleles.add(a); + } + } + } + + return new Genotype(old.getSampleName(), + newGTAlleles,old.hasNegLog10PError() ? old.getNegLog10PError() : VCFConstants.MISSING_QUALITY_v3_DOUBLE, + old.getAttributes().keySet(), old.getAttributes(),old.genotypesArePhased()); + } + +} diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 37c992a27..6180eb99f 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -16,9 +16,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -sn '[CDH]' -sn " + samplesFile + " -env -ef -select 'AF < 0.2' -B:variant,VCF " + testfile + " -NO_HEADER"), + baseTestString(" -sn A -sn '[CDH]' -sn " + samplesFile + " -env -ef -select 'DP < 250' -B:variant,VCF " + testfile + " -NO_HEADER"), 1, - Arrays.asList("3a15628b5980031c629c0c33e7e60b40") + Arrays.asList("e22db73fd99fd4f28d472d407e40ead5") ); executeTest("testComplexSelection--" + testfile, spec); diff --git a/scala/qscript/chartl/omni_qc.q b/scala/qscript/chartl/omni_qc.q index 39fc1a010..4a04f221b 100755 --- a/scala/qscript/chartl/omni_qc.q +++ b/scala/qscript/chartl/omni_qc.q @@ -1,6 +1,7 @@ import java.io.{FileReader, File, BufferedReader} import net.sf.picard.reference.FastaSequenceFile import org.broadinstitute.sting.datasources.pipeline.Pipeline +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils import org.broadinstitute.sting.gatk.DownsampleType import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model import org.broadinstitute.sting.queue.extensions.gatk._ @@ -10,6 +11,7 @@ import org.broadinstitute.sting.queue.{QException, QScript} import collection.JavaConversions._ import org.broadinstitute.sting.utils.yaml.YamlUtils import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType +import scala.collection.mutable.HashMap class omni_qc extends QScript { qscript => @@ -19,95 +21,279 @@ class omni_qc extends QScript { var pilot1_ceu_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf") var pilot1_chb_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CHBJPT.low_coverage.2010_07.genotypes.vcf.gz","vcf") var pilot1_yri_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/YRI.low_coverage.2010_07.genotypes.vcf.gz","vcf") - var august_calls_vcf = null + var august_calls_EUR = new TaggedFile("/humgen/1kg/processing/release/august/EUR.vcf","vcf") + var august_calls_ASN = new TaggedFile("/humgen/1kg/processing/release/august/ASN.vcf","vcf") + var august_calls_AFR = new TaggedFile("/humgen/1kg/processing/release/august/AFR.vcf","vcf") + var august_calls_EUR_refined = new TaggedFile("/humgen/1kg/processing/release/august/EUR.beagle.vcf.gz","vcf") + var august_calls_ASN_refined = new TaggedFile("/humgen/1kg/processing/release/august/ASN.beagle.vcf.gz","vcf") + var august_calls_AFR_refined = new TaggedFile("/humgen/1kg/processing/release/august/AFR.beagle.vcf.gz","vcf") var hiseq_calls_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf") var pilot1_with_na12878_vcf = new TaggedFile("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/v2/N60/lowpass.N60.recal.mG6.retranche.vcf","vcf") + //var august_calls_other_genotypes = _ // OMNI VCF FILES - var OMNI_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b36.vcf","vcf") - var OMNI_b37_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b37.deduped.vcf.gz","vcf") - var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot_b36.vcf.gz","vcf") + var OMNI_b36_vcf = new TaggedFile("/humgen/illumina/1kg_seq_vcfs/Illumina_HapMap_Omni_2.5_764samples.vcf","vcf") + var OMNI_b37_vcf = new TaggedFile("/broad/shptmp/chartl/Omni_2.5_764_samples.b37.deduped.vcf","vcf") + var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot.b36.vcf","vcf") // INTERVALS var pilot3_interval_list: String = "/humgen/gsa-hpprojects/1kg/1kg_pilot3/documents/CenterSpecificTargetLists/results/p3overlap.targets.b36.interval_list" - var pilot1_interval_list: String = _ - var hiseq_interval_list: String = _ - var production_interval_list: String = "20" + var pilot1_interval_list: String = "/broad/shptmp/chartl/omni/resources/Omni_b36_sites.interval.list" + var hiseq_interval_list: String = "/broad/shptmp/chartl/omni/resources/Omni_b36_sites.interval.list" + var production_interval_list: String = "/broad/shptmp/chartl/omni/resources/Omni_b37_sites.chr20.interval.list" + + // REFERENCES + var b36_ref = new File("/humgen/1kg/reference/human_b36_both.fasta") + var b37_ref = new File("/humgen/1kg/reference/human_g1k_v37.fasta") // OTHER - var failed_qc_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/failed_qc_samples.txt") + val analysis_dir = "/broad/shptmp/chartl/omni/" + val resources_dir = analysis_dir + "resources/" + val scratch_dir = analysis_dir + "scratch/" + val eval_dir = analysis_dir + "eval/" + val vcf_dir = analysis_dir + "vcfs/" trait OmniArgs extends CommandLineGATK { - this.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta") this.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") } - protected def sampleFileToString(samFile: File) : List[String] = { - var reader = new BufferedReader(new FileReader(samFile)) - var line: String = "" - var sampleList: List[String] = Nil - line = reader.readLine - while ( line != null ) { - sampleList :+= line - line = reader.readLine - } - - return sampleList - } - - class vcf2bed(b: String) extends CommandLineFunction { + class vcf2bed extends CommandLineFunction { @Input(doc="A VCF file to be put into an interval list") var in_vcf: File = _ @Output(doc="An interval list file to be used with -L") var out_list: File = _ def commandLine = "python /humgen/gsa-scr1/chartl/projects/omni/scripts/vcf2bed.py %s %s".format(in_vcf.getAbsolutePath,out_list.getAbsolutePath) } - protected def runme(samples: File, intervals: String, compVCF: TaggedFile, outDir: String, base: String) { - var subset = new SelectVariants with OmniArgs - subset.variantVCF = Omni_chip_vcf - subset.analysisName = "Subset_"+base - subset.sample = sampleFileToString(samples) - if ( intervals != null ) { - subset.intervalsString :+= intervals - } - subset.out = new TaggedFile(outDir+base+".subset.vcf","vcf") - - var makeOmniSiteList = new vcf2bed("foo") - makeOmniSiteList.analysisName = "Omni_list_"+base - makeOmniSiteList.in_vcf = subset.out - makeOmniSiteList.out_list = new TaggedFile(outDir+base+".subset.omni.intervals.list","intervals.list") + class GetSampleOverlap extends CommandLineFunction { + @Input(doc="A list of VCF files for which to calculate the sample overlap") var in_vcfs: List[File] = Nil + @Output(doc="A file to which to write the overlapping sample names") var outFile: File = _ - var eval = new VariantEval with OmniArgs - eval.rodBind :+= new RodBind("evalOmni","vcf",subset.out) - eval.rodBind :+= new RodBind("comp"+base,"vcf",compVCF) - eval.intervals = makeOmniSiteList.out_list - eval.evalModule :+= "GenotypeConcordance" - eval.evalModule :+= "SimpleMetricsBySample" - eval.reportLocation = new TaggedFile(outDir+base+".subset.omni.eval","eval") - eval.reportType = Some(VE2TemplateType.R) - eval.analysisName = base+"_Eval" - eval.select ++= List("AC==1","AC==2","AC==3","AC==4","AC==5","AC>5") - eval.selectName ++= List("AC1","AC2","AC3","AC4","AC5","AC6+") - eval.disI = true - eval.outputVCF = outDir+base+".eval.interesting.sites.vcf" + /*def commandLine = "grep #CHR %s | sed 's/.vcf:/\\t/g' | cut -f11- | tr '\\t' '\\n' | sort | uniq -c | awk '$1 == %d' | awk '{print $2}' > %s".format( + in_vcfs.foldLeft[String]("")( (str,f) => if ( str.equals("") ) str + f.getAbsolutePath else str + " " + f.getAbsolutePath), + in_vcfs.size, + outFile.getAbsolutePath + )*/ + def commandLine = "python /humgen/gsa-scr1/chartl/projects/omni/scripts/getOverlapSamples.py %s %s".format( + in_vcfs.foldLeft[String]("")( (str,f) => if ( str.equals("") ) str + f.getAbsolutePath else str + " " + f.getAbsolutePath), + outFile.getAbsolutePath + ) + } - if ( base.equalsIgnoreCase("production") ) { - subset.variantVCF = Omni_b37_vcf - subset.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_g1k_v37.fasta") - eval.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_g1k_v37.fasta") - eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod") - } + class GunzipFile extends CommandLineFunction { + @Input(doc="file to gunzip") var gunzipMe: File = _ + @Output(doc="file to gunzip to") var outFile: File = _ - - - add(subset,makeOmniSiteList,eval) + def commandLine = "gunzip -c %s > %s".format(gunzipMe.getAbsolutePath,outFile.getAbsolutePath) } def script = { - runme(pilot1_overlap_samples,pilot1_interval_list,pilot1_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot1/","pilot1") - runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf_broad,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3") - runme(hiseq_overlap_samples,hiseq_interval_list,hiSeq_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/hiSeq/","NA12878_HiSeq") - runme(production_overlap_samples,production_interval_list,production_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/production/","production") - runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3_release") + + /** Convert other chips to merged VCFs **/ + + //var august_call_other_chips: List[(String,File)] = processAuxiliaryChipData(august_calls_other_genotypes) + + + /** Unzip the pilot 1 VCFs and dump them into the resources directory **/ + + var gunzip_p1_ceu = new GunzipFile + var gunzip_p1_chb = new GunzipFile + var gunzip_p1_yri = new GunzipFile + var gunzip_hiseq = new GunzipFile + var gunzip_ag_eur = new GunzipFile + var gunzip_ag_asn = new GunzipFile + var gunzip_ag_afr = new GunzipFile + + gunzip_p1_ceu.gunzipMe = pilot1_ceu_vcf + gunzip_p1_ceu.outFile = new File(resources_dir+"CEU.low_coverage.genotypes.vcf") + gunzip_p1_chb.gunzipMe = pilot1_chb_vcf + gunzip_p1_chb.outFile = new File(resources_dir+"CHB.low_coverage.genotypes.vcf") + gunzip_p1_yri.gunzipMe = pilot1_yri_vcf + gunzip_p1_yri.outFile = new File(resources_dir+"YRI.low_coverage.genotypes.vcf") + gunzip_hiseq.gunzipMe = hiseq_calls_vcf + gunzip_hiseq.outFile = new File(resources_dir+"HiSeq.b36.vcf") + gunzip_ag_eur.gunzipMe = august_calls_EUR_refined + gunzip_ag_eur.outFile = new File(resources_dir+"EUR.refined.vcf") + gunzip_ag_asn.gunzipMe = august_calls_ASN_refined + gunzip_ag_asn.outFile = new File(resources_dir+"ASN.refined.vcf") + gunzip_ag_afr.gunzipMe = august_calls_AFR_refined + gunzip_ag_afr.outFile = new File(resources_dir+"AFR.refined.vcf") + + add(gunzip_p1_ceu,gunzip_p1_yri,gunzip_p1_chb,gunzip_hiseq,gunzip_ag_eur,gunzip_ag_asn,gunzip_ag_afr) + + /** fix the omni ref bases **/ + var fix_421 = new FixRefBases with OmniArgs + var fix_764 = new FixRefBases with OmniArgs + var fix_764_b37 = new FixRefBases with OmniArgs + + fix_421.variantVCF = OMNI_hapmap_b36_vcf + fix_421.reference_sequence = b36_ref + fix_421.out = new File(vcf_dir+swapExt(OMNI_hapmap_b36_vcf.getName,".vcf",".ref_fixed.vcf")) + fix_764.variantVCF = OMNI_b36_vcf + fix_764.reference_sequence = b36_ref + fix_764.out = new File(vcf_dir+swapExt(OMNI_b36_vcf.getName,".vcf",".ref_fixed.vcf")) + fix_764_b37.variantVCF = OMNI_b37_vcf + fix_764_b37.reference_sequence = b37_ref + fix_764_b37.out = new File(vcf_dir+swapExt(OMNI_b37_vcf.getName,".vcf",".ref_fixed.vcf")) + + add(fix_421,fix_764,fix_764_b37) + + /** Propagate AC/AN annotations to Omni files via variant annotator **/ + var annotate_421 = new VariantAnnotator with OmniArgs + var annotate_764 = new VariantAnnotator with OmniArgs + var annotate_764_b37 = new VariantAnnotator with OmniArgs + + annotate_421.variantVCF = fix_421.out + annotate_421.reference_sequence = b36_ref + annotate_421.annotation :+= "ChromosomeCounts" + annotate_421.out = new File(vcf_dir+swapExt(fix_421.out.getName,".vcf",".annot.vcf")) + annotate_764.variantVCF = fix_764.out + annotate_764.reference_sequence = b36_ref + annotate_764.annotation :+= "ChromosomeCounts" + annotate_764.out = new File(vcf_dir+swapExt(fix_764.out.getName,".vcf",".annot.vcf")) + annotate_764_b37.variantVCF = fix_764_b37.out + annotate_764_b37.reference_sequence = b37_ref + annotate_764_b37.annotation :+= "ChromosomeCounts" + annotate_764_b37.out = new File(vcf_dir+swapExt(fix_764_b37.out.getName,".vcf",".annot.vcf")) + + add(annotate_421,annotate_764,annotate_764_b37) + + /** Eval the omni chip against the various comps **/ + runEval(annotate_764.out,gunzip_p1_ceu.outFile,"OMNI_764","Pilot1_CEU",pilot1_interval_list, b36_ref) + runEval(annotate_421.out,gunzip_p1_ceu.outFile,"OMNI_421","Pilot1_CEU",pilot1_interval_list, b36_ref) + runEval(annotate_764.out,gunzip_p1_chb.outFile,"OMNI_764","Pilot1_CHB",pilot1_interval_list, b36_ref) + runEval(annotate_421.out,gunzip_p1_chb.outFile,"OMNI_421","Pilot1_CHB",pilot1_interval_list, b36_ref) + runEval(annotate_764.out,gunzip_p1_yri.outFile,"OMNI_764","Pilot1_YRI",pilot1_interval_list, b36_ref) + runEval(annotate_421.out,gunzip_p1_yri.outFile,"OMNI_421","Pilot1_YRI",pilot1_interval_list, b36_ref) + runEval(annotate_764.out,pilot3_release_vcf,"OMNI_764","Pilot3",pilot3_interval_list, b36_ref) + runEval(annotate_421.out,pilot3_release_vcf,"OMNI_421","Pilot3",pilot3_interval_list, b36_ref) + runEval(annotate_764_b37.out,gunzip_ag_eur.outFile,"OMNI_764","August_EUR",production_interval_list, b37_ref) + runEval(annotate_764_b37.out,gunzip_ag_asn.outFile,"OMNI_764","August_ASN",production_interval_list, b37_ref) + runEval(annotate_764_b37.out,gunzip_ag_afr.outFile,"OMNI_764","Ausust_AFR",production_interval_list, b37_ref) + runEval(annotate_764.out,gunzip_hiseq.outFile,"OMNI_764","HiSeq",hiseq_interval_list, b36_ref) + runEval(annotate_764.out,OMNI_hapmap_b36_vcf,"OMNI_764","OMNI_421",pilot1_interval_list,b36_ref) + + var eval1KG_exclude = new VariantEval with OmniArgs + eval1KG_exclude.samples :+= "/broad/shptmp/chartl/omni/scratch/OMNI_764_vs_Pilot3.sample_overlap.exclude.mixups.txt" + eval1KG_exclude.rodBind :+= new RodBind("evalOMNI_764","VCF",annotate_764.out) + eval1KG_exclude.rodBind :+= new RodBind("compPilot3","VCF",pilot3_release_vcf) + eval1KG_exclude.evalModule :+= "GenotypeConcordance" + eval1KG_exclude.evalModule :+= "SimpleMetricsBySample" + eval1KG_exclude.reference_sequence = b36_ref + eval1KG_exclude.reportType = Some(VE2TemplateType.CSV) + eval1KG_exclude.intervalsString :+= pilot3_interval_list + eval1KG_exclude.out = new File(eval_dir+"%s_vs_%s.%s".format("OMNI_764","Pilot3","exclude.mixups.eval.csv")) + + add(eval1KG_exclude) + + runAFComparison(annotate_764.out, gunzip_p1_ceu.outFile, gunzip_p1_chb.outFile, gunzip_p1_yri.outFile) + + } + + def processAuxiliaryChipData(otherChips: File) : List[(String,File)] = { + // todo ==== me + return Nil + } + + def runEval(eval: File, comp: File, eBase: String, cBase: String, intervals: String, reference: File) = { + var base = "%s_vs_%s".format(eBase,cBase) + var getOverlap = new GetSampleOverlap + getOverlap.in_vcfs :+= eval + getOverlap.in_vcfs :+= comp + getOverlap.outFile = new File(scratch_dir+base+".sample_overlap.txt") + add(getOverlap) + + var vEval = new VariantEval with OmniArgs + vEval.samples :+= getOverlap.outFile.getAbsolutePath + vEval.rodBind :+= new RodBind("eval"+eBase,"VCF",eval) + vEval.rodBind :+= new RodBind("comp"+cBase,"VCF",comp) + vEval.evalModule :+= "GenotypeConcordance" + vEval.evalModule :+= "SimpleMetricsBySample" + vEval.intervalsString :+= intervals + vEval.reference_sequence = reference + vEval.reportType = Some(VE2TemplateType.CSV) + + vEval.out = new File(eval_dir+base+".eval.csv") + + add(vEval) + + } + + def swapExt(s: String, d: String, f: String) : String = { + return s.stripSuffix(d)+f + } + + def runAFComparison(omni: File, p1ceu: File, p1asn: File, p1yri:File ) : Boolean = { + // step one, set up some of the info + var populations : List[String] = Nil // these are the pilot 1 populations + populations :+= "CEU" + populations :+= "CHBJPT" + populations :+= "YRI" + var panels : List[String] = Nil // these are the analysis panels + panels :+= "EUR" + panels :+= "ASN" + panels :+= "ASW" + panels :+= "AFR" + panels :+= "ADM" + // step two -- subset the OMNI chip to the actual sample names + var nameToSubset: HashMap[String,SelectVariants] = new HashMap[String,SelectVariants] + for ( p <- populations ) { + nameToSubset += p -> sampleSubset(p,omni) + } + + for ( p <- panels ) { + nameToSubset += p -> sampleSubset(p,omni) + } + + // step three -- compare the pilot 1 files against all populations and panels + + runComps("Pilot1CEU",p1ceu,"CEU",nameToSubset("CEU").out) + runComps("Pilot1CEU",p1ceu,"EUR",nameToSubset("EUR").out) + runComps("Pilot1CHBJPT",p1asn,"CHBJPT",nameToSubset("CHBJPT").out) + runComps("Pilot1CHBJPT",p1asn,"ASN",nameToSubset("ASN").out) + runComps("Pilot1YRI",p1yri,"YRI",nameToSubset("YRI").out) + runComps("Pilot1YRI",p1yri,"AFR",nameToSubset("AFR").out) + runComps("EUR",nameToSubset("EUR").out,"AFR",nameToSubset("AFR").out) + runComps("EUR",nameToSubset("EUR").out,"ASN",nameToSubset("ASN").out) + runComps("EUR",nameToSubset("EUR").out,"ASW",nameToSubset("ASW").out) + runComps("EUR",nameToSubset("EUR").out,"AMR",nameToSubset("ADM").out) + + return true + + } + + def getOmniSampleListByPanel(panel: String) : String = { + return scratch_dir+"OMNI_764_%s.txt".format(panel) + } + + def sampleSubset(panel: String, omni: File) : SelectVariants = { + var sv : SelectVariants = new SelectVariants with OmniArgs + sv.reference_sequence = b36_ref + sv.variantVCF = omni + sv.sample :+= getOmniSampleListByPanel(panel) + sv.out = new File(vcf_dir+swapExt(omni.getName,".vcf",".%s.vcf".format(panel))) + add(sv) + return sv + } + + def runComps(eBase: String, evalVCF: File, cBase: String, compVCF: File) = { + var eval: VariantEval = new VariantEval with OmniArgs + eval.reference_sequence = b36_ref + eval.rodBind :+= new RodBind("eval%s".format(eBase),"VCF",evalVCF) + eval.rodBind :+= new RodBind("comp%s".format(cBase),"VCF",compVCF) + eval.noStandard = true + eval.E :+= "AlleleFrequencyComparison" + + add(eval) + + var combine: CombineVariants = new CombineVariants with OmniArgs + combine.reference_sequence = b36_ref + combine.rodBind :+= new RodBind(eBase,"VCF",evalVCF) + combine.rodBind :+= new RodBind(cBase,"VCF",compVCF) + combine.out = new File(vcf_dir+"%s_plus_%s.vcf".format(eBase,cBase)) + combine.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY) + combine.priority = "%s,%s".format(eBase,cBase) + + add(combine) + } }